def learn(gym_id, episodes=1000, batch_size=32, model_path="models/model.h5"):
    env = gym.make(gym_id)
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.n
    agent = DQN(create_model(num_states, num_actions))

    for e in range(episodes):
        state = env.reset()
        state = np.reshape(state, [1, num_states])
        total_reward = 0.

        for steps in range(500):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, agent.state_size])
            agent.remember(state, action, reward, next_state, done)
            total_reward += reward
            state = next_state

            if done:
                print('Episode {}/{} done in {} steps, total reward {}'.format(
                    e + 1, episodes, steps + 1, total_reward))
                if total_reward >= 200:
                    agent.save(model_path)
                    return agent
                break

            if agent.memory_size > batch_size:
                # train the agent with the experience of the episode
                agent.train(batch_size)

    env.close()
    return None

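# A minimal, hypothetical driver for the learn() helper above. It assumes the
# legacy Gym (<0.26) reset/step API used by the function and that create_model()
# and DQN are defined in this module; the environment ID and episode count are
# illustrative choices, not part of the original excerpt.
if __name__ == "__main__":
    trained_agent = learn("CartPole-v1", episodes=500, batch_size=32,
                          model_path="models/cartpole.h5")
    if trained_agent is None:
        print("No episode reached the reward threshold of 200.")
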
def learn(args):
    grid = args.grids[0]
    rico = Ricochet()
    rico.grid.load_grid(grid)
    app = Application(board=rico.grid, show_grid=True)

    if args.deep:
        model = DQN((16, 16), 16, verbose=True)
    else:
        model = Qlearn()
    if args.input:
        model.load_model(args.input)

    _thread.start_new_thread(
        _model_act, (app, model, *args.grids), {
            "output_path": (args.output if args.output else None),
            "learning": 1
        })
    app.mainloop()

def play(args):
    grid = args.grids[0]
    rico = Ricochet()
    rico.grid.load_grid(grid)
    app = Application(board=rico.grid, show_grid=True)

    if args.deep:
        model = DQN((16, 16), 16, exploration_rate=0, exploration_decay=0,
                    exploration_min=0)
    else:
        model = Qlearn(exploration_rate=0, exploration_decay=0,
                       exploration_min=0)
    model.load_model(args.model)

    _thread.start_new_thread(_model_act, (app, model, *args.grids), {
        "learning": False,
        "nb_episode": 1,
        "nb_step": 0,
        "max_moves": 500
    })
    app.mainloop()

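# A hypothetical CLI wiring for the learn/play entry points above. The flag names
# simply mirror the attributes those functions read (grids, deep, input, output,
# model); the project's real parser may differ.
import argparse

parser = argparse.ArgumentParser(description="Ricochet Robots RL")
subparsers = parser.add_subparsers()

p_learn = subparsers.add_parser("learn")
p_learn.add_argument("grids", nargs="+")
p_learn.add_argument("--deep", action="store_true")
p_learn.add_argument("--input")
p_learn.add_argument("--output")
p_learn.set_defaults(func=learn)

p_play = subparsers.add_parser("play")
p_play.add_argument("grids", nargs="+")
p_play.add_argument("--deep", action="store_true")
p_play.add_argument("model")
p_play.set_defaults(func=play)

args = parser.parse_args()
args.func(args)
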
sys.stdout.write("\r" + text)
sys.stdout.flush()


if __name__ == "__main__":
    seed = 1364
    total_episodes = 5001
    reward_curve_display_frequency = 100
    save_model_frequency = 100
    learning_rate = 0.001
    epsilon_decay = 0.0001
    gradient_clipping_norm = 0.7

    # Instantiate RL objects
    env = CartPoleV0(seed=seed)
    explorer = ActionExplorer(epsilon_decay=epsilon_decay, seed=seed)
    agent = DQN(env.input_dim, env.num_actions,
                explorer=explorer,
                gradient_clipping_norm=gradient_clipping_norm,
                learning_rate=learning_rate,
                double_dqn=True,
                seed=seed)

    # Run training
    train(env, agent,
          total_episodes=total_episodes,
          reward_curve_display_frequency=reward_curve_display_frequency,
          save_model_frequency=save_model_frequency)

model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dense(action_size))

# Learning rate and optimizer (must be TF!)
LEARNING_RATE = 1e-3
tf_optimizer = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE)

# Create the policy; this is Epsilon-Greedy (exponential decay)
policy = EGP(init_eps=0.95, min_epsilon=0.01, decay=0.003)

# Can create an object for Prioritized Experience Replay
per = PER(priority_importance=0.6, initial_anneal=0.5, anneal_growth_rate=0.00008)

# Make the agent! In this case a Double DQN with PER.
# Can make it dueling and auto-add streams, or, like the commented model above,
# just dueling with add_dueling_streams set to False.
agent = DQN(double_dqn=True,
            PER=per,
            dueling_dqn=False,
            add_dueling_streams=False,
            model=model,
            optimizer=tf_optimizer,
            policy=policy,
            action_size=action_size,
            state_processor=state_processor,
            gamma=0.95,
            target_model_update_policy='soft',
            target_model_hard_policy_wait=500,
            target_model_soft_policy_constant=0.9,
            replay_period_wait=4,
            reward_clipping=True,
            huber_loss=True,
            batch_size=64,
            max_memory_length=10000)

# Make callbacks if you want; reward and epsilon are implemented
rew_cb = PrintReward()
eps_cb = PrintEpsilon(episodic=True, iterations=None)

# Make a benchmark if you want to keep track of info and data on the agent's testing performance
benchmark = Benchmark('bench_0', episode_iteration=1)

agent.train(env, 100000, None, print_rew_cb=rew_cb, print_eps_cb=eps_cb,
            visualize=False, allow_printing=True)
agent.test(env, 50000, None, print_rew_cb=rew_cb, benchmark=benchmark,
           visualize=False, allow_printing=True)

def run_lrm(env_params, lp, rl):
    """
    This code learns a reward machine from experience and uses DQN (or QRM) to
    learn an optimal policy for that RM:
        - 'env_params' is the environment parameters
        - 'lp' is the set of learning parameters
        - 'rl' is the RL approach used to learn the policy ("dqn" or "qrm")
    Returns the training rewards
    """
    # Initializing parameters and the game
    env = Game(env_params)
    rm = RewardMachine(lp.rm_u_max, lp.rm_preprocess, lp.rm_tabu_size, lp.rm_workers,
                       lp.rm_lr_steps, env.get_perfect_rm(), lp.use_perfect_rm)
    actions = env.get_actions()
    policy = None
    train_rewards = []
    rm_scores = []
    reward_total = 0
    last_reward = 0
    step = 0

    # Collecting random traces for learning the reward machine
    print("Collecting random traces...")
    while step < lp.rm_init_steps:
        # running an episode using a random policy
        env.restart()
        trace = [(env.get_events(), 0.0)]
        for _ in range(lp.episode_horizon):
            # executing a random action
            a = random.choice(actions)
            reward, done = env.execute_action(a)
            o2_events = env.get_events()
            reward_total += reward
            trace.append((o2_events, reward))
            step += 1

            # Testing
            if step % lp.test_freq == 0:
                print("Step: %d\tTrain: %0.1f" % (step, reward_total - last_reward))
                train_rewards.append((step, reward_total - last_reward))
                last_reward = reward_total

            # checking if the episode finishes
            if done or lp.rm_init_steps <= step:
                if done:
                    rm.add_terminal_observations(o2_events)
                break

        # adding this trace to the set of traces that we use to learn the rm
        rm.add_trace(trace)

    # Learning the reward machine using the collected traces
    print("Learning the reward machine...")
    _, info = rm.learn_the_reward_machine()
    rm_scores.append((step,) + info)

    # Start learning a policy for the current rm
    finish_learning = False
    while step < lp.train_steps and not finish_learning:
        env.restart()
        o1_events = env.get_events()
        o1_features = env.get_features()
        u1 = rm.get_initial_state()
        trace = [(o1_events, 0.0)]
        add_trace = False

        for _ in range(lp.episode_horizon):
            # reinitializing the policy if the rm changed
            if policy is None:
                print("Learning a policy for the current RM...")
                if rl == "dqn":
                    policy = DQN(lp, len(o1_features), len(actions), rm)
                elif rl == "qrm":
                    policy = QRM(lp, len(o1_features), len(actions), rm)
                else:
                    assert False, "RL approach is not supported yet"

            # selecting an action using epsilon greedy
            a = policy.get_best_action(o1_features, u1, lp.epsilon)

            # executing the action
            reward, done = env.execute_action(a)
            o2_events = env.get_events()
            o2_features = env.get_features()
            u2 = rm.get_next_state(u1, o2_events)

            # updating the number of steps and total reward
            trace.append((o2_events, reward))
            reward_total += reward
            step += 1

            # updating the current RM if needed
            rm.update_rewards(u1, o2_events, reward)
            if done:
                rm.add_terminal_observations(o2_events)
            if rm.is_observation_impossible(u1, o1_events, o2_events):
                # if o2 is impossible according to the current RM,
                # then the RM has a bug and must be relearned
                add_trace = True

            # Saving this transition
            policy.add_experience(o1_events, o1_features, u1, a, reward,
                                  o2_events, o2_features, u2, float(done))

            # Learning and updating the target networks (if needed)
            policy.learn_if_needed()

            # Testing
            if step % lp.test_freq == 0:
                print("Step: %d\tTrain: %0.1f" % (step, reward_total - last_reward))
                train_rewards.append((step, reward_total - last_reward))
                last_reward = reward_total
                # finishing the experiment if the max number of learning steps was reached
                if policy._get_step() > lp.max_learning_steps:
                    finish_learning = True

            # checking if the episode finishes or the agent reaches the maximum number of training steps
            if done or lp.train_steps <= step or finish_learning:
                break

            # Moving to the next state
            o1_events, o1_features, u1 = o2_events, o2_features, u2

        # If the trace isn't correctly predicted by the reward machine,
        # we add the trace and relearn the machine
        if add_trace and step < lp.train_steps and not finish_learning:
            print("Relearning the reward machine...")
            rm.add_trace(trace)
            same_rm, info = rm.learn_the_reward_machine()
            rm_scores.append((step,) + info)
            if not same_rm:
                # if the RM changed, we have to relearn all the q-values...
                policy.close()
                policy = None
            else:
                print("the new RM is not better than the current RM!!")
                # input()

    if policy is not None:
        policy.close()
        policy = None

    # return the training rewards
    return train_rewards, rm_scores, rm.get_info()

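# Hypothetical call site for run_lrm. It assumes the environment parameters and
# the learning-parameter object are built by the project's own configuration
# code; the variable names below are placeholders, not part of the excerpt.
train_rewards, rm_scores, rm_info = run_lrm(env_params, lp, rl="dqn")
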
            'dense_layers': [30, 15, env.num_actions],
            'activation': 'relu',
            # 'dense_bn': True
        },
        # 'gradient_clipping_norm': 0.7,
        'reward_to_go': True,
        'set_device': 'cpu',
        'learning_rate': 0.1,
        'seed': seed
    },
    'total_episodes': 10001
}
}

agents = {
    'dqn': DQN(**parameters['dqn']['parameters']),
    'vpg': VanillaPolicyGradient(**parameters['vpg']['parameters'])
}

for agent_name in agents.keys():
    agent = agents[agent_name]

    # Run training
    train(
        env, agent,
        total_episodes=parameters[agent_name]['total_episodes'],
        rolling_window_size=rolling_window_size,
        reward_curve_display_frequency=reward_curve_display_frequency,
        save_model_frequency=save_model_frequency
    )

def main(args):
    # gpus
    if args.gpus is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

    # setup environment
    env = ContinuousPuckStack(args.num_blocks, args.num_pucks, args.num_pucks, args.num_actions,
                              height_noise_std=args.height_noise_std,
                              random_shape=args.random_shape,
                              action_failure_prob=args.action_failure_prob)
    env.initStride = args.init_env_stride  # stride for initial puck placement
    env.stride = args.env_stride  # stride for action specification

    # setup the agent
    state_shape = (args.num_blocks * 28, args.num_blocks * 28, 1)
    output_shape = (args.num_actions, args.num_actions)
    agent = DQN(env, state_shape, output_shape, args.num_filters, args.filter_sizes,
                args.strides, args.hiddens, args.learning_rate, args.batch_size,
                constants.OPT_MOMENTUM, args.exploration_fraction, 1.0,
                args.final_epsilon, args.max_time_steps,
                buffer_size=args.buffer_size,
                prioritized_replay=not args.disable_prioritized_replay,
                target_net=not args.disable_target_network,
                target_update_freq=args.target_update_freq,
                target_size=args.target_size,
                fix_dones=args.fix_dones)
    agent.start_session(args.num_cpu, args.gpu_memory_fraction)

    # maybe load weights
    if args.load_weights:
        agent.load(args.load_weights)
        print("Loaded weights.")

    # initialize a solver
    transitions = []
    collect_pre, collect_post = collect_factory_bisim(args.num_pucks)
    collect_data = collect_data_factory_bisim(transitions, args.save_exp_num)

    t_collect_pre, t_collect_post, t_collect_data = None, None, None
    if not args.save_exp_after_training:
        t_collect_pre, t_collect_post, t_collect_data = collect_pre, collect_post, collect_data

    solver = Solver(env, agent, args.max_time_steps, learning_start=LEARNING_STARTS,
                    train_freq=TRAIN_FREQ, max_episodes=args.max_episodes,
                    rewards_file=args.rewards_file, animate=args.animate,
                    animate_from=args.animate_from, gif_save_path=args.save_gifs_path,
                    gif_save_limit=args.save_limit,
                    gif_save_only_successful=args.save_only_successful,
                    max_depth_value=args.num_pucks, collect_pre=t_collect_pre,
                    collect_post=t_collect_post, collect_data=t_collect_data)

    # solve the environment
    solver.run()

    # save the weights of the network
    if args.save_weights is not None:
        agent.save(args.save_weights)

    # maybe run trained DQN
    if args.save_exp_after_training:
        agent.exploration_fraction = 1.0
        agent.init_explore = args.save_exp_eps
        agent.final_explore = args.save_exp_eps
        agent.setup_exploration_()

        solver = Solver(env, agent, args.save_exp_num, learning_start=args.save_exp_num,
                        train_freq=TRAIN_FREQ, train=False,
                        max_episodes=args.save_exp_num * 100,
                        collect_pre=collect_pre, collect_post=collect_post,
                        collect_data=collect_data)
        solver.run()

    # maybe save the collected experience
    if args.save_exp_path is not None:
        if args.save_q_values:
            set_q_values_for_transitions(transitions, agent, args.batch_size)

        save_dir = os.path.dirname(args.save_exp_path)
        if len(save_dir) > 0 and not os.path.isdir(save_dir):
            os.makedirs(save_dir)

        with open(args.save_exp_path, "wb") as file:
            pickle.dump(transitions, file)

    # stop session
    agent.stop_session()

                      num_actions=env.available_actions,
                      policy=policy,
                      test_policy=policy,
                      processor=processor)
    else:
        # Setup DQN agent
        if opt.recurrent:
            model = DRQN_Model(window_length=opt.dqn_window_length,
                               num_actions=env.available_actions)
        else:
            model = DQN_Model(window_length=opt.dqn_window_length,
                              num_actions=env.available_actions)

        # Setup DQN agent
        agent = DQN(model=model,
                    num_actions=env.available_actions,
                    policy=policy,
                    test_policy=policy,
                    processor=processor)
else:
    assert not opt.recurrent

    # Setup random process for exploration
    random_process = [
        GaussianWhiteNoiseProcess(sigma=0.0, mu=1.0),
        GaussianWhiteNoiseProcess(sigma=1.0, mu=0.0)
    ]

    # Setup DDPG agent model
    actor, critic, action_input = DDPG_Model(
        window_length=opt.ddpg_window_length,
        num_actions=env.available_actions)

    # Setup DDPG agent
    agent = DDPG(actor=actor,

class Agent:

    def __init__(self, stateCnt, actionCnt, **kwargs):
        if 'state_1d' in kwargs:
            state_1d = kwargs['state_1d']
        else:
            state_1d = False
        if 'dueling' in kwargs:
            dueling = kwargs['dueling']
        else:
            dueling = False

        self.steps = 0
        self.epsilon = globalvars.MAX_EPSILON
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.dqn = DQN(self.stateCnt, self.actionCnt, state_1d=state_1d, dueling=dueling)
        self.memory = Memory()

    def acts(self, s):
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt - 1)
        else:
            return np.argmax(self.dqn.predictOne(s))

    def observe(self, sample):
        if self.steps <= globalvars.REPLAY_START_SIZE:
            error = abs(sample[2])
            self.memory.add(error, sample)
        else:
            x, y, a, errors = self._getTargets([(0, sample)])
            self.memory.add(errors[0], sample)

            if self.steps % globalvars.SYNC_TARGET == 0:
                self.dqn.update_target_model()

            # Epsilon decay
            self.epsilon = globalvars.MIN_EPSILON + \
                (globalvars.MAX_EPSILON - globalvars.MIN_EPSILON) * \
                math.exp(-globalvars.LAMBDA * (self.steps - globalvars.REPLAY_START_SIZE))

        self.steps += 1

    def _getTargets(self, batch):
        states = np.array([o[1][0] for o in batch])
        if len(self.stateCnt) > 1:
            no_state = np.zeros(self.stateCnt)
        else:
            no_state = np.zeros(self.stateCnt[0])
        states_ = np.array([(no_state if o[1][3] is None else o[1][3]) for o in batch])

        p = self.dqn.predict(states)
        p_ = self.dqn.predict(states_, target=False)
        pTarget_ = self.dqn.predict(states_, target=True)

        if len(self.stateCnt) > 1:
            x = np.zeros(states.shape)
        else:
            x = np.zeros((len(batch), self.stateCnt[0]))
        y = np.zeros((len(batch), self.actionCnt))
        errors = np.zeros(len(batch))
        actions = []

        for i in range(len(batch)):
            o = batch[i][1]
            s = o[0]
            a = o[1]
            r = o[2]
            s_ = o[3]

            t = p[i]
            oldVal = t[a]
            if s_ is None:
                t[a] = r
            else:
                # Double DQN target: online net picks the action, target net evaluates it
                t[a] = r + globalvars.GAMMA * pTarget_[i][np.argmax(p_[i])]

            x[i] = s
            y[i] = t
            actions.append(a)
            errors[i] = abs(oldVal - t[a])

        # return the full list of actions rather than only the last loop value
        return x, y, actions, errors

    def replay(self):
        batch = self.memory.sample(globalvars.BATCH_SIZE)
        x, y, a, errors = self._getTargets(batch)

        for i in range(len(batch)):
            idx = batch[i][0]
            self.memory.update(idx, errors[i])

        self.dqn.train(x, y)

    def save(self, name):
        self.dqn.save(name)
        print('Saved model to ', name)

    def load(self, name):
        self.dqn.load(name)
        print('Loaded model from ', name)

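# A minimal, hypothetical driver for the Agent class above. It assumes a
# legacy-Gym environment (4-tuple step API) whose flattened observation length
# matches stateCnt, and that globalvars defines the constants the class uses.
# Terminal transitions store next_state=None, matching _getTargets().
import gym

env = gym.make("CartPole-v0")
agent = Agent(stateCnt=(env.observation_space.shape[0],),
              actionCnt=env.action_space.n)

for episode in range(200):
    s = env.reset()
    done = False
    while not done:
        a = agent.acts(s)
        s_, r, done, _ = env.step(a)
        agent.observe((s, a, r, None if done else s_))
        if agent.steps > globalvars.REPLAY_START_SIZE:
            agent.replay()
        s = s_
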
parser.add_argument('-n', '--normalize', help='Normalize inputs', action='store_true')
args = parser.parse_args()

if not args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Build environment
env = Environment(args.env, args.render, args.normalize)

# Load config
with open(pkg_resources.resource_filename(
        __name__, f'../config/{args.agent.lower()}.yaml')) as file:
    config = yaml.load(file, Loader=yaml.FullLoader)
config = config[args.env]

# Build model
model = None
if args.agent == 'DQN':
    model = DQN(env, config)
elif args.agent == 'A2C':
    model = A2C(env, config)
elif args.agent == 'PPO':
    model = PPO(env, config)

# Train model
model.train()

                      num_actions=env.action_space.n,
                      policy=policy,
                      test_policy=policy,
                      processor=processor)
    else:
        # Setup DQN agent
        if opt.recurrent:
            model = DRQN_Model(window_length=opt.dqn_window_length,
                               num_actions=env.action_space.n)
        else:
            model = DQN_Model(window_length=opt.dqn_window_length,
                              num_actions=env.action_space.n)

        # Setup DQN agent
        agent = DQN(model=model,
                    num_actions=env.action_space.n,
                    policy=policy,
                    test_policy=policy,
                    processor=processor)
else:
    agent = RandomAgent(num_actions=env.action_space.n, processor=processor)

print(args.env_name + ' initialized.')

# Setup weights path
path = os.path.join('weights', 'Atari', '{}'.format(args.env_name))
if not os.path.exists(path):
    os.makedirs(path)
weights_path = os.path.join(path, 'weights.hdf5')

# Run the agent
agent.fit(env=env,

# parser.add_argument('--alpha', help='weight for intrinsic reward and external reward')
# parser.add_argument('--term', help='termination factor')
args = parser.parse_args()

# initialize RL elements
env = gym.make('CartPole-v0')
env = env.unwrapped
sess = tf.Session()

# seed setting
np.random.seed(args.seed)
tf.set_random_seed(args.seed)

# info about the cartpole task
# print env.action_space
# print env.observation_space
# print env.observation_space.high  # bound for state
# print env.observation_space.low

buffer = ReplayBuffer(args.capacity, args.batch, args.seed)
agent = DQN(sess,
            env.observation_space.shape[0],
            env.action_space.n,
            buffer=buffer)

example = DQNDemo(agent, env, max_episode=args.episodes)
example.run()

import unittest
import gym
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from collections import deque
import numpy as np
from agents.dqn import DQN

env = gym.make('CartPole-v0')
agent = DQN(env)


class UnitTests(unittest.TestCase):
    '''
    Unittest suite
    '''

    def test_build_model(self):
        '''
        Unittest for the _build_model function.

        Check that the model's input and output layers match the observation
        space and action space, specifically for the CartPole problem.
        '''
        random_model = agent._build_model()
        assert random_model.input_shape[1] == 4, \
            "The model is not compatible with CartPole's observation space of size 4"

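# Standard unittest entry point, assumed here so the suite can be run directly
# as a script; the original project may rely on an external test runner instead.
if __name__ == '__main__':
    unittest.main()
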
def train():
    # build SFDQN
    print('building SFDQN')
    deep_sf = DeepSF(keras_model_handle=sf_model_lambda, **sfdqn_params)
    sfdqn = SFDQN(deep_sf=deep_sf,
                  buffer=ReplayBuffer(sfdqn_params['buffer_params']),
                  **sfdqn_params, **agent_params)

    # train SFDQN
    print('training SFDQN')
    train_tasks, test_tasks = generate_tasks(False)
    sfdqn_perf = sfdqn.train(train_tasks, n_samples, test_tasks=test_tasks,
                             n_test_ev=agent_params['n_test_ev'])

    # build DQN
    print('building DQN')
    dqn = DQN(model_lambda=dqn_model_lambda,
              buffer=ReplayBuffer(dqn_params['buffer_params']),
              **dqn_params, **agent_params)

    # training DQN
    print('training DQN')
    train_tasks, test_tasks = generate_tasks(True)
    dqn_perf = dqn.train(train_tasks, n_samples, test_tasks=test_tasks,
                         n_test_ev=agent_params['n_test_ev'])

    # smooth data
    def smooth(y, box_pts):
        return np.convolve(y, np.ones(box_pts) / box_pts, mode='same')

    sfdqn_perf = smooth(sfdqn_perf, 10)[:-5]
    dqn_perf = smooth(dqn_perf, 10)[:-5]
    x = np.linspace(0, 4, sfdqn_perf.size)

    # reporting progress
    ticksize = 14
    textsize = 18
    plt.rc('font', size=textsize)        # controls default text sizes
    plt.rc('axes', titlesize=textsize)   # fontsize of the axes title
    plt.rc('axes', labelsize=textsize)   # fontsize of the x and y labels
    plt.rc('xtick', labelsize=ticksize)  # fontsize of the tick labels
    plt.rc('ytick', labelsize=ticksize)  # fontsize of the tick labels
    plt.rc('legend', fontsize=ticksize)  # legend fontsize

    plt.figure(figsize=(8, 6))
    ax = plt.gca()
    ax.plot(x, sfdqn_perf, label='SFDQN')
    ax.plot(x, dqn_perf, label='DQN')
    plt.xlabel('training task index')
    plt.ylabel('averaged test episode reward')
    plt.title('Testing Reward Averaged over all Test Tasks')
    plt.tight_layout()
    plt.legend(frameon=False)
    plt.savefig('figures/sfdqn_return.png')

def run_baseline(env_params, lp, rl, k_order):
    """
    This baseline learns a policy directly over a stack of the k most recent
    observations (no reward machine is learned):
        - 'env_params' is the environment parameters
        - 'lp' is the set of learning parameters
        - 'rl' is the approach used to select actions ("dqn" or "human")
        - 'k_order' is the number of stacked observations
    Returns the training rewards
    """
    # Initializing parameters and the game
    env = Game(env_params)
    actions = env.get_actions()
    policy = None
    train_rewards = []
    reward_total = 0
    last_reward = 0
    step = 0

    # Start learning a policy
    while step < lp.train_steps:
        env.restart()
        o1_events = env.get_events()
        o1_features = env.get_features()

        # computing the stack of features for o1
        k_prev_obs = [np.zeros(len(o1_features)) for _ in range(k_order - 1)]  # saves the k previous observations
        k_prev_obs.insert(0, o1_features)
        o1_stack = np.concatenate(tuple(k_prev_obs), axis=None)

        for _ in range(lp.episode_horizon):
            # initializing the policy if needed
            if policy is None:
                if rl == "dqn":
                    policy = DQN(lp, k_order * len(o1_features), len(actions), None)
                elif rl == "human":
                    policy = None
                else:
                    assert False, "RL approach is not supported yet"

            # selecting an action using epsilon greedy
            if rl == "human":
                if random.random() < 0.1:
                    a = random.randrange(4)
                else:
                    a = env.get_optimal_action().value
            else:
                a = policy.get_best_action(o1_stack, 0, lp.epsilon)

            # executing the action
            reward, done = env.execute_action(a)
            o2_events = env.get_events()
            o2_features = env.get_features()

            # Appending the new observation and computing the stack of features for o2
            k_prev_obs.insert(0, o2_features)
            k_prev_obs.pop()
            o2_stack = np.concatenate(tuple(k_prev_obs), axis=None)

            # updating the number of steps and total reward
            reward_total += reward
            step += 1

            if rl != "human":
                # Saving this transition
                policy.add_experience(o1_events, o1_stack, 0, a, reward,
                                      o2_events, o2_stack, 0, float(done))

                # Learning and updating the target networks (if needed)
                policy.learn_if_needed()

            # Testing
            if step % lp.test_freq == 0:
                print("Step: %d\tTrain: %0.1f" % (step, reward_total - last_reward))
                train_rewards.append((step, reward_total - last_reward))
                last_reward = reward_total

            # checking if the episode finishes
            if done or lp.train_steps <= step:
                break

            # Moving to the next state
            o1_events, o1_features, o1_stack = o2_events, o2_features, o2_stack

    # closing the policy
    if policy is not None:
        policy.close()
        policy = None

    # return the training rewards
    return train_rewards