def __init__(self, model_name, env):
    super(DQNAgent, self).__init__(model_name, env)
    self.episode = self.configs.episode
    self.batch_size = self.configs.batch_size
    self.gamma = self.configs.gamma
    self.eps_start = self.configs.eps_start
    self.eps_end = self.configs.eps_end
    self.eps_decay = self.configs.eps_decay
    self.target_update_episode = self.configs.target_update_episode
    self.model_path = self.configs.save_path
    self.save_episode = self.configs.save_episode
    self.plot_episode = self.configs.plot_episode
    self.policy_net = models.DQN(self.configs, env).to(self.device)
    self.target_net = models.DQN(self.configs, env).to(self.device)
    self.load_model(self.model_path)
    self.optimizer = optim.Adam(
        self.policy_net.parameters(),
        lr=self.configs.optimizer_lr,
        betas=(self.configs.optimizer_beta1, self.configs.optimizer_beta2),
        eps=self.configs.optimizer_eps,
        weight_decay=self.configs.optimizer_weight_decay)
    self.memory = utils.ReplayMemory(10000)
    self.num_random_choose = 0
    self.num_choice_per_dim = self.configs.num_choice_per_dim
    self.action_dim = env.action_spec().shape
    self.action_min = env.action_spec().minimum
    self.action_max = env.action_spec().maximum
    self.action_space = utils.enumerate(self.num_choice_per_dim,
                                        self.action_min, self.action_max)
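# The snippet above relies on a helper (utils.enumerate) to turn a continuous,
# dm_control-style action spec into a discrete action set for the DQN head.
# A minimal sketch of such a helper, assuming it returns every combination of
# num_choice_per_dim evenly spaced values per action dimension; the actual
# utils.enumerate may differ, and enumerate_action_grid is a hypothetical name.
import itertools

import numpy as np


def enumerate_action_grid(num_choice_per_dim, action_min, action_max):
    # One linspace per action dimension, e.g. 3 choices over [-1, 1] -> [-1, 0, 1].
    axes = [np.linspace(lo, hi, num_choice_per_dim)
            for lo, hi in zip(np.atleast_1d(action_min), np.atleast_1d(action_max))]
    # Cartesian product over dimensions gives one row per discrete action.
    return np.array(list(itertools.product(*axes)))


# Usage: index the grid with the argmax over the policy network's Q-values, e.g.
# grid = enumerate_action_grid(3, env.action_spec().minimum, env.action_spec().maximum)
# action = grid[q_values.argmax().item()]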
def __init__(self, env):
    self.env = env
    self.input_size = env.observation_space.shape[0]
    self.output_size = env.action_space.n
    self.gamma = 0.9
    self.num_episodes = 5000
    self.replay_buffer = deque()
    self.MEMORY_SIZE = 50000
    self.model = models.DQN(self.input_size, self.output_size, [10])
    self.target_Q = models.DQN(self.input_size, self.output_size, [10])
    self.criterion = nn.MSELoss()
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01)
def __init__(self, actions, params={}):
    self.params = params
    self.model = models.ForwardModel(params).cuda()
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=params['lr'])
    if 'horizon' in params.keys():
        self.horizon = params['horizon']
    self.num_actions = actions
    self.traj = []
    self.replay_buffer = {
        'states': [],
        'actions': [],
        'next_states': [],
        'rewards': [],
        'levels': [],
        'next_levels': []
    }
    self.current_level = 0
    self.started_training = False
    self.action_buffer = []
    self.mode = 'explore'
    self.ucb_c = params['ucb_c']
    self.dqn = models.DQN(params).cuda()
    self.dqn_optimizer = torch.optim.Adam(self.dqn.parameters(), lr=0.0001)
    print("[NeuraE3] Initialized")
def main():
    config, experiment_name = arguments.get_args()
    # Set seed
    random.seed(config.seed)
    numpy.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    experiment = f'{config.results_dir}/{experiment_name}/'
    print("EXPERIMENT NAME: ", experiment_name)

    # Create the experiment folder and logger
    if not os.path.exists(experiment):
        os.makedirs(experiment)
    logger = utils.SimpleLogger(f'{experiment}/log.txt')

    # Copy source code
    srcpath = experiment + '/src/'
    if not os.path.exists(srcpath):
        os.makedirs(srcpath)
    os.system(f'cp *.py {srcpath}')

    # Define log settings
    log_path = experiment + '/train_baseline.log'

    # Create agent and environment
    env = environment.EnvironmentWrapper(config)
    agent = agents.Agent(config)
    if config.cuda == 1:
        agent = agent.cuda()
    optimizer = optim.Adam(agent.parameters(), lr=config.learning_rate)
    agent.dqn = models.DQN(config).cuda()
    optimizer_dqn = optim.Adam(agent.dqn.parameters(), lr=config.dqn_learning_rate)
    agent.best_dqn_params = agent.dqn.state_dict()
    keep_training_dqn = True
    print(f'# parameters: {utils.count_parameters(agent)}')

    # Load checkpoint if one exists
    # if os.path.isfile(experiment + '/agent.pth'):
    #     print(f'[loading checkpoint from {experiment}]')
    #     checkpoint = torch.load(experiment + '/agent.pth')
    #     agent.load_state_dict(checkpoint['agent'].state_dict())
    #     optimizer.load_state_dict(checkpoint['optimizer'].state_dict())
    #     agent.replay_memory = checkpoint['agent'].replay_memory
    #     epoch = checkpoint['ep'] + 1
    #     perf = torch.load(experiment + '/perf.pth')
    #     print(f'[resuming at epoch {epoch}]')
    # else:
    #     epoch = 0
    #     perf = {'losses': [], 'metrics': [], 'rewards': []}
    epoch = 0
    perf = {'losses': [], 'metrics': [], 'rewards': []}

    tensorboard = Tensorboard(
        config.results_dir + f'/tensorboard/{experiment_name}',
        log_dir=config.results_dir + '/tensorboard_logs/')

    best_exploit_perf = -math.inf
    dqn_epochs_completed = 0

    # Start algorithm
    phase = 'explore'
    while epoch < 200:
        if epoch < config.n_exploration_epochs:
            #### Explore phase
            phase = 'explore'
            agent.eval()
            exploration_policy = 'random' if epoch == 0 else config.exploration_policy
            ep_reward, ep_length = agent.act(env, 'train', config,
                                             policy=exploration_policy,
                                             goal='explore')
            logger.log(
                f'EXPLORE PHASE | mean reward: {ep_reward}, mean episode length: {ep_length}'
            )
            if config.test == 1:
                agent.act(env, 'test', config, policy=exploration_policy, goal='explore')

            # Train the models
            for i in range(config.n_training_epochs):
                if i < config.n_training_epochs - 1:
                    split = 'train'
                    agent.train()
                else:
                    split = 'test'
                    agent.eval()
                losses, log_string = agent.train_model(
                    split, 'explore', optimizer, config, tensorboard,
                    update=(split == 'train'))
                logger.log(f'TRAINING MODEL | ep {epoch}/{i} | {log_string}')
        else:
            #### Exploit phase
            if 'maze' in config.env:
                # just do search
                ep_reward, ep_length = agent.act(
                    env, 'train', config, policy='particle2', goal='exploit',
                    n_episodes=config.n_trajectories)
                logger.log(
                    f'EXPLOIT PHASE: epoch {epoch}, mean reward: {ep_reward}, mean episode length: {ep_length}'
                )
            else:
                if phase == 'explore':
                    # this is our first time exploiting - train the DQN for a while
                    phase = 'exploit'
                    agent.train_policy_dqn('train', 'explore', optimizer_dqn, config,
                                           n_updates=config.dqn_model_updates,
                                           logger=logger)
                    agent.best_dqn_params = copy.deepcopy(agent.dqn.state_dict())
                    keep_training_dqn = True
                else:
                    # act in the environment
                    if keep_training_dqn:
                        agent.train_policy_dqn('train', 'explore', optimizer_dqn,
                                               config, n_updates=25000, logger=logger)
                    ep_reward, ep_length = agent.act(
                        env, 'test', config, policy='dqn',
                        n_episodes=config.dqn_eval_ep)
                    logger.log(
                        f'EXPLOIT PHASE: epoch {epoch}, mean reward: {ep_reward}, mean episode length: {ep_length}, DQN training: {keep_training_dqn}'
                    )
                    if keep_training_dqn:
                        if ep_reward >= best_exploit_perf or config.checkpoint_dqn == 0:
                            best_exploit_perf = ep_reward
                            agent.best_dqn_params = copy.deepcopy(agent.dqn.state_dict())  # TODO clone!!!!
                        else:
                            agent.dqn.load_state_dict(agent.best_dqn_params)
                            keep_training_dqn = False

        perf['epoch'] = epoch
        perf['rewards'].append(ep_reward)
        torch.save(perf, f'{experiment}/perf.pth')
        torch.save({'agent': agent, 'optimizer': optimizer, 'ep': epoch},
                   f'{experiment}/agent.epoch{epoch}.pth')
        torch.save({'agent': agent, 'optimizer': optimizer, 'ep': epoch},
                   f'{experiment}/agent.pth')
        torch.save(agent.replay_memory, f'{experiment}/replay_memory.pth')
        epoch += 1
    # Tail of test_policy (the function header and step loop are not part of
    # this snippet): per-step action selection and the episode's total return.
    if render:
        env.render()
    state_t = torch.tensor(state)
    if type(env.action_space) == gym.spaces.Discrete:
        action_values = network(state_t.unsqueeze(0))
        action = torch.argmax(action_values)
    else:
        mu, _ = network(state_t)
        action = mu.cpu().numpy()
    state, reward, done, _ = env.step(action)
    total_reward += reward
    env.close()
    return total_reward


if __name__ == "__main__":
    net_state_file_dir = os.getcwd()
    env = preproccessing.make_env(env_name)
    print(net_state_file_dir)
    policy = models.DQN(env.observation_space.shape, env.action_space.n)
    print(f"environment name: {env_name} | Solving reward: {env.spec.reward_threshold} | "
          f"Episode length: {env.spec.max_episode_steps}")
    while True:
        policy.load_state_dict(
            torch.load(net_state_file_dir + f'/DQN-{env_name}_network_state.pt'))
        total_return = test_policy(policy, env, render=render, record=record)
        print(f"Total return {total_return}")
        print("Press Enter to play again")
        input('')
def prepare():
    if not os.path.exists(CONST.LOG_PATH):
        os.mkdir(CONST.LOG_PATH)
    if not os.path.exists(CONST.LOG_SYSBENCH_PATH):
        os.mkdir(CONST.LOG_SYSBENCH_PATH)

    global opt, task_detail, instance_detail, model_detail
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=2, help='Training Batch Size')
    parser.add_argument('--memory', type=str, default='', help='add replay memory')
    parser.add_argument('--task_id', type=int, required=True, help='get task info')
    parser.add_argument('--inst_id', type=int, required=True, help='get inst info')
    parser.add_argument('--model_id', type=int, required=True, help='get model info')
    parser.add_argument('--host', type=str, required=True, help='cluster host for set mysql param')
    opt = parser.parse_args()

    task_id = opt.task_id
    inst_id = opt.inst_id
    model_id = opt.model_id
    init_logger(task_id, False, True)
    CONST.TASK_ID = task_id
    logger.info("start cdbtune")
    logger.info(opt)

    # taskdb = database("127.0.0.1", 3306, "root", "123456", "cdbtune")
    taskdb = database("10.249.50.200", 4839, "cdbtune", "123456", "cdbtune")
    rsp_task = taskdb.fetch_all("select * from tb_task where task_id = %d" % task_id)
    rsp_inst = taskdb.fetch_all("select * from tb_mysql_inst where inst_id = %d" % inst_id)
    rsp_model = taskdb.fetch_all("select * from tb_models where model_id = %d" % model_id)
    if len(rsp_task) == 0 or len(rsp_inst) == 0 or len(rsp_model) == 0:
        os_quit(Err.INPUT_ERROR, "task_id or inst_id or model_id doesn't exist")

    task_detail = rsp_task[0]
    instance_detail = rsp_inst[0]
    model_detail = rsp_model[0]
    method = model_detail["method"]
    model_path = model_detail["position"]
    num_knobs = model_detail["knobs"]
    num_metrics = model_detail["dimension"]
    env = environment.TencentServer(instance=instance_detail,
                                    task_detail=task_detail,
                                    model_detail=model_detail,
                                    host=opt.host)

    # Build models
    if method == 'ddpg':
        ddpg_opt = dict()
        ddpg_opt['tau'] = 0.001
        ddpg_opt['alr'] = 0.00001
        ddpg_opt['clr'] = 0.00001
        ddpg_opt['model'] = model_path
        gamma = 0.99
        memory_size = 100000
        ddpg_opt['gamma'] = gamma
        ddpg_opt['batch_size'] = opt.batch_size
        ddpg_opt['memory_size'] = memory_size
        model = models.DDPG(n_states=num_metrics,
                            n_actions=num_knobs,
                            opt=ddpg_opt,
                            ouprocess=True)
    else:
        model = models.DQN()

    if len(opt.memory) > 0:
        model.replay_memory.load_memory(opt.memory)
        logger.info("Load Memory: {}".format(len(model.replay_memory)))

    # Load mean value and variance
    current_knob = environment.get_init_knobs()

    return env, model
env.reset()
plt.figure()

BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10
IMAGE_SIZE = 3 * 32 * 48

tb = SummaryWriterWithGlobal('cartpole')

policy_net = models.DQN().to(device)
target_net = models.DQN().to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

vae = models.ThreeLayerLinearVAE(IMAGE_SIZE, 10).to(device)
vae_optim = optim.Adam(vae.parameters(), lr=1e-3)
optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

eye = Eyes(env, vae)
# eye.registerObserver('raw', OpenCV('raw'))
eye.registerObserver('raw', OpenCV('raw'))
eye.registerObserver('recon', OpenCV('recon'))
eye.registerObserver('raw', ImageFileWriter('data/images/fullscreen', 'raw'))
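# ReplayMemory is constructed above but not defined in this snippet. A minimal
# sketch of a compatible uniform replay buffer, assuming the usual
# push/sample/__len__ interface; the actual class used here may differ.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    def __init__(self, capacity):
        # Bounded deque: once full, the oldest transitions are dropped automatically.
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        # Uniform random minibatch for decorrelated updates.
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)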
    ddpg_opt['model'] = opt.params
    n_states = opt.metric_num
    gamma = 0.99
    memory_size = 100000
    num_actions = opt.default_knobs + opt.other_knob
    ddpg_opt['gamma'] = gamma
    ddpg_opt['batch_size'] = opt.batch_size
    ddpg_opt['memory_size'] = memory_size
    model = models.DDPG(n_states=n_states,
                        n_actions=num_actions,
                        opt=ddpg_opt,
                        ouprocess=not opt.noisy)
else:
    model = models.DQN()

if not os.path.exists('log'):
    os.mkdir('log')
if not os.path.exists('save_memory'):
    os.mkdir('save_memory')
if not os.path.exists('save_knobs'):
    os.mkdir('save_knobs')
if not os.path.exists('save_state_actions'):
    os.mkdir('save_state_actions')
if not os.path.exists('model_params'):
def _create_agents(config_list):
    """
    Create agents with different hyper-parameters.

    Parameters
    ----------
    config_list : list of dict
        List of parameter dicts. Each dict has configurations such as
        model name, learning rate, etc.

    Returns
    -------
    Created agents list and core agent object.
    """
    try:
        agents = []
        for config in config_list:
            hyper_parameters = utils.Hyperparameter(
                batch_size=config["batch_size"],
                gamma=config["gamma"],
                eps_start=config["eps_start"],
                eps_end=config["eps_end"],
                eps_decay=config["eps_decay"],
                target_update=config["target_update"],
                default_durability=config["default_durability"],
                learning_rate=config["learning_rate"],
                initial_memory=config["initial_memory"],
                n_episode=config["n_episode"],
                n_actions=config["n_action"],
                default_durability_decreased_level=config["default_durability_decreased_level"],
                default_durability_increased_level=config["default_durability_increased_level"],
                default_check_frequency=config["default_check_frequency"],
                default_healing_frequency=config["default_healing_frequency"],
                env_name=config["env_name"],
                exp_name=config["exp_name"],
                render=config["render"],
                run_name=config["run_name"],
                output_directory_path=config["output_directory_path"],
                hyper_dash=config["hyper_dash"],
                model_saving_frequency=config["model_saving_frequency"],
                parameters_name=config["name"],
                roulette_mode=config["roulette_mode"],
                max_reward=config["max_reward"],
                min_reward=config["min_reward"])
            print(config["name"])
            if config["name"] != "core":
                if config["model"] == "DQN":
                    policy_net = models.DQN(n_actions=4).to(hyper_parameters.DEVICE)
                    target_net = models.DQN(n_actions=4).to(hyper_parameters.DEVICE)
                elif config["model"] == "DDQN":
                    policy_net = models.DDQN(n_actions=4).to(hyper_parameters.DEVICE)
                    target_net = models.DDQN(n_actions=4).to(hyper_parameters.DEVICE)
                elif config["model"] == "DQNbn":
                    policy_net = models.DQNbn(n_actions=4).to(hyper_parameters.DEVICE)
                    target_net = models.DQNbn(n_actions=4).to(hyper_parameters.DEVICE)
                elif config["model"] == "NonBatchNormalizedDQN":
                    # The original called .to() without a device; pass the configured device.
                    policy_net = models.NonBatchNormalizedDQN(n_actions=4).to(hyper_parameters.DEVICE)
                    target_net = models.NonBatchNormalizedDQN(n_actions=4).to(hyper_parameters.DEVICE)
                # elif args["model"] == "RamDQN":
                #     policy_net = models.RamDQN(n_actions=4).to(hyper_parameters.DEVICE)
                #     target_net = models.RamDQN(n_actions=4).to(hyper_parameters.DEVICE)
                else:
                    policy_net = models.DQN(n_actions=4).to(hyper_parameters.DEVICE)
                    target_net = models.DQN(n_actions=4).to(hyper_parameters.DEVICE)
                optimizer = optim.Adam(policy_net.parameters(),
                                       lr=hyper_parameters.LEARNING_RATE)
                agents.append(
                    Agent(policy_net, target_net,
                          hyper_parameters.DEFAULT_DURABILITY,
                          optimizer, config["name"], hyper_parameters))
            else:
                # For core agent
                policy_net = models.NonBatchNormalizedDQN(n_actions=4).to(hyper_parameters.DEVICE)
                target_net = models.NonBatchNormalizedDQN(n_actions=4).to(hyper_parameters.DEVICE)
                optimizer = optim.Adam(policy_net.parameters(),
                                       lr=hyper_parameters.LEARNING_RATE)
                core_agent = Agent(policy_net, target_net,
                                   hyper_parameters.DEFAULT_DURABILITY,
                                   optimizer, config["name"], hyper_parameters)
            print("Agent: {} has been created".format(config["name"]))

        try:
            core_agent
        except Exception as e:
            print("P_RuntimeError:0x1000 Core agent has not been defined.")
            tb = sys.exc_info()[2]
            print(e.with_traceback(tb))
            sys.exit(1)
        return agents, core_agent
    except Exception as e:
        print("P_RuntimeError:0x1001 Some arguments are missing.")
        tb = sys.exc_info()[2]
        print(e.with_traceback(tb))
        sys.exit(1)
##################################################################
BATCH_SIZE = 32
REPLAY_MEMORY_SIZE = 100000
DOUBLE_DQN = True
EPSILON = 1.0
GAMMA = 0.99
ANNEAL_TO = 0.02
ANNEAL_OVER = 100000  # time steps
ANNEAL_STEP = (EPSILON - ANNEAL_TO) / ANNEAL_OVER
NUM_EPISODES = 10000000
NUM_WARMSTART = 35  # episodes
MAX_NOOP_ITERS = 30
TARGET_UPDATE = 1000  # time steps
PROGRESS_INTERVAL = 10  # episodes

policy_net = models.DQN(FRAME_HISTORY_SIZE, NUM_ACTIONS).to(device)
target_net = models.DQN(FRAME_HISTORY_SIZE, NUM_ACTIONS).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

#optimizer = optim.RMSprop(policy_net.parameters(), lr=0.00025, momentum=0.95, eps=0.01)
#optimizer = optim.RMSprop(policy_net.parameters(), lr=0.00005)
#optimizer = optim.Adam(policy_net.parameters(), lr=0.0001)  #7
optimizer = optim.RMSprop(policy_net.parameters(), lr=0.0025, alpha=0.9, eps=1e-02, momentum=0.0)  #8
#optimizer = optim.RMSprop(policy_net.parameters(), lr=0.00025, alpha=0.95, eps=0.01, momentum=0.0)  #9

memory = ReplayMemory(REPLAY_MEMORY_SIZE)


# Select an action randomly without annealing EPSILON
def select_random_action():
    return torch.tensor([[random.randrange(NUM_ACTIONS)]], device=device, dtype=torch.long)
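# A sketch of the epsilon-greedy counterpart to select_random_action, assuming
# the linear annealing implied by EPSILON, ANNEAL_STEP and ANNEAL_TO above; the
# original selection code is not part of this snippet.
def select_action(state):
    global EPSILON
    if random.random() < EPSILON:
        action = select_random_action()
    else:
        with torch.no_grad():
            # Greedy action from the online network; keep the [[...]] shape
            # produced by select_random_action.
            action = policy_net(state).max(1)[1].view(1, 1)
    # Linearly anneal epsilon towards ANNEAL_TO, one step per call.
    EPSILON = max(ANNEAL_TO, EPSILON - ANNEAL_STEP)
    return action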
print('-------------------------------DQN----------------------------------')
print('----------------------------------------------------------------------------------------')
print(f"environment name: {env_name} | number of iterations: {iters}")
print(f"Hyperparameters: learning rate: {lr} | gamma: {gamma} | Epsilon: {eps}")
print(f"Hyperparameters: Epsilon decay rate: {eps_rate} | Buffer size: {buffer_size} | Batch size: {batch_size}")

DQN_net = models.DQN(env.observation_space.shape, env.action_space.n)
target_net = models.DQN(env.observation_space.shape, env.action_space.n)
buffer = ReplayBuffer(buffer_size)
epsilone = rlUtils.Epsilon(eps, eps_min, eps_rate)
experience = collections.namedtuple(
    'Experience', ['obs', 'action', 'reward', 'next_obs', 'done'])
print(DQN_net)
print('----------------------------------------------------------------------------------------')
optimizer = opt.Adam(DQN_net.parameters(), lr=lr)
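# A sketch of a single update step for the networks built above, assuming the
# buffer yields a list of the `experience` namedtuples and that target_net is
# periodically synced from DQN_net elsewhere; the original training loop is not
# part of this snippet, and dqn_update is a hypothetical name.
import numpy as np
import torch
import torch.nn.functional as F


def dqn_update(batch, gamma):
    obs = torch.as_tensor(np.array([e.obs for e in batch]), dtype=torch.float32)
    actions = torch.as_tensor([e.action for e in batch], dtype=torch.int64)
    rewards = torch.as_tensor([e.reward for e in batch], dtype=torch.float32)
    next_obs = torch.as_tensor(np.array([e.next_obs for e in batch]), dtype=torch.float32)
    dones = torch.as_tensor([float(e.done) for e in batch])

    # Q(s, a) for the actions that were actually taken.
    q_sa = DQN_net(obs).gather(1, actions.unsqueeze(1)).squeeze(1)
    # One-step bootstrapped target from the frozen target network.
    with torch.no_grad():
        target = rewards + gamma * target_net(next_obs).max(1)[0] * (1.0 - dones)

    loss = F.mse_loss(q_sa, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()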