def run_a3c_thread(baseDir, runNo, args, max_episode, test_only):
    init_nn_library(True, args['gpu'])
    if not test_only:
        stats = run_a3c(**args)
        #with open(baseDir + '/test-' + str(runNo) + '/final_stats.pkl', 'w') as f:
        #    pickle.dump(stats, f)
    run_test('dqn', args, baseDir, runNo, max_episode)
def run_td_realtime(**kargs):
    if kargs['output_dir'] is None and kargs['logdir'] is not None:
        kargs['output_dir'] = kargs['logdir']
    from collections import namedtuple
    args = namedtuple("TDRealtimeParams", kargs.keys())(*kargs.values())
    if 'dont_init_tf' not in kargs.keys() or not kargs['dont_init_tf']:
        init_nn_library(True, "1")
    env = get_env(args.game, args.atari, args.env_transforms)
    envOps = EnvOps(env.observation_space.shape, env.action_space.n, args.learning_rate, mode="train")
    print(env.observation_space.low)
    print(env.observation_space.high)
    env_model = globals()[args.env_model](envOps)
    if args.env_weightfile is not None:
        env_model.model.load_weights(args.env_weightfile)
    v_model = globals()[args.vmodel](envOps)
    import numpy as np
    td_model = TDNetwork(env_model.model, v_model, envOps)
    summary_writer = tf.summary.FileWriter(args.logdir, K.get_session().graph) if args.logdir is not None else None
    replay_buffer = ReplayBuffer(args.replay_buffer_size, 1, args.update_frequency, args.replay_start_size, args.batch_size)
    from utils.network_utils import NetworkSaver
    network_saver = NetworkSaver(args.save_freq, args.logdir, v_model.model)
    v_agent = VAgent(env.action_space, env_model, v_model, envOps, summary_writer, True, replay_buffer, args.target_network_update)
    egreedyOps = EGreedyOps()
    if replay_buffer is not None:
        egreedyOps.REPLAY_START_SIZE = replay_buffer.REPLAY_START_SIZE
    egreedyOps.mode = args.mode
    egreedyOps.test_epsilon = args.test_epsilon
    #egreedyOps.FINAL_EXPLORATION_FRAME = 10000
    if args.mode == "train":
        egreedyOps.FINAL_EXPLORATION_FRAME = args.egreedy_final_step
    if args.mode == "train":
        if args.egreedy_decay < 1:
            egreedyOps.DECAY = args.egreedy_decay
            egreedyAgent = EGreedyAgentExp(env.action_space, egreedyOps, v_agent)
        else:
            egreedyAgent = MultiEGreedyAgent(env.action_space, egreedyOps, v_agent, args.egreedy_props, args.egreedy_final, final_exp_frame=args.egreedy_final_step)
    else:
        egreedyAgent = EGreedyAgent(env.action_space, egreedyOps, v_agent)
    runner = Runner(env, egreedyAgent, None, 1, max_step=args.max_step, max_episode=args.max_episode)
    runner.listen(replay_buffer, None)
    runner.listen(v_agent, None)
    runner.listen(egreedyAgent, None)
    runner.listen(network_saver, None)
    #runner.run()
    return runner, v_agent
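# --- Standalone illustration (not part of the training pipeline) ---
# When egreedy_decay < 1, run_td_realtime picks EGreedyAgentExp and sets
# egreedyOps.DECAY; presumably epsilon is then decayed multiplicatively per
# step down to a floor, roughly as sketched below. The class and attribute
# names here are hypothetical; the real logic lives in agents.agent.
class _ExpEpsilonSketch:
    def __init__(self, decay, initial_epsilon=1.0, final_epsilon=0.01):
        self.epsilon = initial_epsilon
        self.decay = decay
        self.final_epsilon = final_epsilon

    def on_step(self):
        # multiply epsilon by the decay factor each step, clipped at a floor
        self.epsilon = max(self.final_epsilon, self.epsilon * self.decay)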
def run_td_realtime_thread(baseDir, runNo, thread_id, args):
    init_nn_library(True, '0')
    runner, agent = run_td_realtime(**args)
    runner.run()
    # pickle needs a binary file handle in Python 3
    with open(baseDir + '/' + str(args['target_network_update']) + '-' + str(runNo) + '.pkl', 'wb') as f:
        pickle.dump(agent.stats, f)
def run_dqn_thread(baseDir, runNo, thread_id, args):
    init_nn_library(True, '0')
    runner, agent = run_dqn(**args)
    runner.run()
    # pickle needs a binary file handle in Python 3
    with open(baseDir + '/' + str(args['learning_rate']) + '-' + str(runNo) + '.pkl', 'wb') as f:
        pickle.dump(agent.stats, f)
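# --- Standalone illustration (not part of the training pipeline) ---
# The *_thread wrappers above are written to be launched as parallel trials,
# each pickling its own stats file per runNo. A hedged sketch of such a
# launcher; `dqn_kwargs` stands in for the full run_dqn keyword-argument
# dict, which is not spelled out here.
def _launch_dqn_trials(baseDir, dqn_kwargs, n_runs=4):
    import threading
    threads = [
        threading.Thread(target=run_dqn_thread,
                         args=(baseDir, runNo, 0, dict(dqn_kwargs)))
        for runNo in range(n_runs)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()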
args = parser.parse_args()

from PIL import Image
from envs.gym_env import gym_env
from envs.env_transform import WarmUp, ActionRepeat, ObservationStack, EnvTransform
from utils.preprocess import *
from utils.network_utils import NetworkSaver
from runner.runner import Runner
from agents.agent import DqnAgent, DqnAgentOps, EGreedyOps, EGreedyAgent, MultiEGreedyAgent, EGreedyAgentExp
from utils.memory import ReplayBuffer, NStepBuffer
from nets.net import QModel, DQNModel, DqnOps, init_nn_library
import tensorflow as tf
import keras.backend as K

init_nn_library(True, "1")

from utils.viewer import EnvViewer

class Penalizer(EnvTransform):
    def __init__(self, env):
        super(Penalizer, self).__init__(env)

    def reset(self):
        ob = self.env.reset()
        self.score = 0
        return ob

    def step(self, action):
        # classic gym's step returns a 4-tuple; the original 3-value unpack
        # would raise a ValueError
        ob, reward, done, info = self.env.step(action)
        # the method body was truncated in the source; restored minimally:
        # track the running score and pass the transition through
        self.score += reward
        return ob, reward, done, info
def run_td_realtime_thread(baseDir, runNo, args, max_episode, test_only):
    init_nn_library(True, args['gpu'])
    if not test_only:
        runner, _ = run_td_realtime(**args)
        runner.run()
    run_test('td_realtime', args, baseDir, runNo, max_episode)
def run_td_thread(baseDir, runNo, args, max_episode, test_only):
    print('*************GPU: *************** ' + args['gpu'])
    init_nn_library(True, args['gpu'])
    if not test_only:
        stats = run_td(**args)
    run_test('td', args, baseDir, runNo, max_episode)
from runner.runner import Runner, RunnerListener
from agents.agent import DqnAgent, DqnAgentOps, EGreedyOps, EGreedyAgent, MultiEGreedyAgent
from utils.memory import ReplayBuffer, NStepBuffer
from nets.net import A3CModel, QModel, DQNModel, DqnOps, init_nn_library, TabularQModel, CartPoleModel
import tensorflow as tf
import keras.backend as K
import threading
from threading import Lock
from utils.stopable_thread import StoppableThread

if ENABLE_RENDER:
    from gym.envs.classic_control import rendering
    viewer = rendering.SimpleImageViewer()

init_nn_library(False, "1")

if args.game == "Grid":
    env = GridEnv()
else:
    env = gym_env(args.game)

#modelOps = DqnOps(env.action_count)
#modelOps.dueling_network = args.dueling_dqn
#modelOps.INPUT_SIZE = env.observation_space.n
#modelOps.LEARNING_RATE = 0.2
#modelOps.AGENT_HISTORY_LENGTH = 1
#model = TabularQModel(modelOps)

from nets.initializers import dqn_uniform
from keras.layers import Input, Permute, ZeroPadding2D, Conv2D, Flatten, Dense, Add, Subtract, Lambda
def run_td_test(**kargs):
    if ('output_dir' not in kargs or kargs['output_dir'] is None) and kargs['logdir'] is not None:
        kargs['output_dir'] = kargs['logdir']
    from collections import namedtuple
    args = namedtuple("TDTestParams", kargs.keys())(*kargs.values())
    if 'dont_init_tf' in kargs and not kargs['dont_init_tf']:
        init_nn_library(True, "1")
    #env = gym_env(args.game)
    print('Monitor dir', kargs['monitor_dir'] if 'monitor_dir' in kargs else None)
    env = get_env(args.game, args.atari, args.env_transforms, kargs['monitor_dir'] if 'monitor_dir' in kargs else None)
    viewer = None
    if args.enable_render:
        viewer = EnvViewer(env, args.render_step, 'human')
    envOps = EnvOps(env.observation_space.shape, env.action_space.n, 0)
    #print(env.observation_space.low)
    #print(env.observation_space.high)
    env_model = globals()[args.env_model](envOps)
    if args.env_weightfile is not None:
        env_model.model.load_weights(args.env_weightfile)
    v_model = globals()[args.vmodel](envOps)
    weight_files = []
    if not isinstance(args.load_weightfile, list):
        weight_files = [(args.load_weightfile, 0)]
    else:
        idxs = range(int(args.load_weightfile[1]), int(args.load_weightfile[3]), int(args.load_weightfile[2]))
        weight_files = [(args.load_weightfile[0] + str(I) + '.h5', I) for I in idxs]
    summary_writer = tf.summary.FileWriter(args.logdir, K.get_session().graph) if args.logdir is not None else None
    sw = SummaryWriter(summary_writer, ['Average reward', 'Total reward'])
    #sw = SummaryWriter(summary_writer, ['Reward'])
    stats = {'reward': []}
    for I, weight_file_info in enumerate(weight_files):
        weight_file = weight_file_info[0]
        total_step_count = weight_file_info[1]
        v_model.model.load_weights(weight_file)
        v_agent = VAgent(env.action_space, env_model, v_model, envOps, None, False)
        runner = Runner(env, v_agent, None, 1, max_step=args.max_step, max_episode=args.max_episode)
        runner.listen(v_agent, None)
        if viewer is not None:
            runner.listen(viewer, None)
        runner.run()
        tmp_stats = np.array(v_agent.stats['reward'])
        total_reward = tmp_stats[:, 1].sum()
        total_reward = total_reward / args.max_episode  # average reward per episode
        aver_reward = total_reward / tmp_stats[-1, 0]   # average reward per step
        sw.add([aver_reward, total_reward], I)
        stats['reward'].append((total_step_count, total_reward))
        print('{0} / {1}: Aver Reward per step = {2}, Aver Reward per episode = {3}'.format(I + 1, len(weight_files), aver_reward, total_reward))
    return stats
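# --- Standalone illustration (not part of the training pipeline) ---
# When load_weightfile is a list, run_td_test reads it as
# [prefix, start, step, stop] and expands it to prefix<I>.h5 checkpoints,
# evaluating each in turn. The paths below are made up.
def _demo_weightfile_expansion():
    load_weightfile = ['logdir/weights_', '10000', '10000', '50001']
    idxs = range(int(load_weightfile[1]), int(load_weightfile[3]), int(load_weightfile[2]))
    return [(load_weightfile[0] + str(I) + '.h5', I) for I in idxs]
    # -> [('logdir/weights_10000.h5', 10000), ..., ('logdir/weights_50000.h5', 50000)]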
def run_td(**kargs):
    debug = False
    if kargs['output_dir'] is None and kargs['logdir'] is not None:
        kargs['output_dir'] = kargs['logdir']
    from collections import namedtuple
    args = namedtuple("TDParams", kargs.keys())(*kargs.values())
    target_network_update = ParameterDecay(args.target_network_update)
    if 'dont_init_tf' in kargs and not kargs['dont_init_tf']:
        init_nn_library(True, "1")
    env = get_env(args.game, args.atari, args.env_transforms, kargs['monitor_dir'] if 'monitor_dir' in kargs else None)
    envOps = EnvOps(env.observation_space.shape, env.action_space.n, args.learning_rate)
    #print(env.observation_space.low)
    #print(env.observation_space.high)
    env_model = globals()[args.env_model](envOps)
    if args.env_weightfile is not None:
        env_model.model.load_weights(args.env_weightfile)
    v_model = globals()[args.vmodel](envOps)
    import numpy as np
    td_model = TDNetwork(env_model.model, v_model, envOps, False, kargs['derivative_coef'] if 'derivative_coef' in kargs else 0)
    summary_writer = tf.summary.FileWriter(args.logdir, K.get_session().graph) if args.logdir is not None else None
    sw = SummaryWriter(summary_writer, ['Loss'])
    if args.load_trajectory is not None:
        from utils.trajectory_utils import TrajectoryLoader
        traj = TrajectoryLoader(args.load_trajectory)
    from utils.network_utils import NetworkSaver
    network_saver = NetworkSaver(args.save_freq, args.logdir, v_model.model)
    import scipy.stats as stats
    td_exponent = ParameterDecay(kargs['td_exponent'] if 'td_exponent' in kargs and kargs['td_exponent'] is not None else 2)
    #from tensorflow.keras.utils import plot_model
    #plot_model(td_model.td_model, to_file='td_model.png')
    td_model.td_model.summary()
    print('TDNetwork Layers')
    for layer_idx, layer in enumerate(td_model.td_model.layers):
        print('Layer ', layer_idx, layer.name, layer.shape if hasattr(layer, "shape") else layer.input_shape)
    for I in range(args.max_step):
        #batch = np.random.uniform([-4.8, -5, -0.48, -5], [4.8, 5, 0.48, 5], size=(args.batch_size, 4))
        #lower, upper = -1, 1
        #mu, sigma = 0.5, 0.4
        #X = stats.truncnorm((lower - mu) / sigma, (upper - mu) / sigma, loc=mu, scale=sigma)
        #samples = np.random.uniform([-1], [1], size=(5000, 1))
        if hasattr(env_model, "get_samples"):
            samples = env_model.get_samples(args.sample_count)
        else:
            samples = np.random.uniform(args.smin, args.smax, size=(args.sample_count, len(args.smin)))
        res = td_model.test(samples)
        if isinstance(res, (list,)):
            res = res[0]
        td_errors = res.flatten()
        # sample states with probability proportional to |TD error| ** td_exponent
        props = np.abs(td_errors)
        #props = np.multiply(props, props)
        props = np.power(props, td_exponent())
        props = props / props.sum()
        count = 0
        if isinstance(samples, list):
            count = samples[0].shape[0]
        else:
            count = samples.shape[0]
        idxs = np.random.choice(count, args.batch_size, False, props)
        batch = {
            #'current': np.random.uniform([-1], [1], size=(args.batch_size, 1))
            #'current': X.rvs((args.batch_size, 1))
            'current': [a[idxs] for a in samples] if isinstance(samples, list) else samples[idxs]
        }
        #batch = traj.sample(args.batch_size)
        if debug:  # @ersin - test code for Freeway
            old_loss = td_model.test(batch['current'])
            print(I, old_loss.flatten())
            decoder_input = [batch['current'][1].astype(np.float32), batch['current'][0].astype(np.float32)]
            decoded_output = env_model.model_decoder.predict_on_batch(decoder_input)
            save_image(decoded_output, args.batch_size, f'{kargs["output_dir"]}/{I}_sample')
        loss = td_model.train(batch['current'])
        #@ersin - tests
        if debug and False:
            save_image(batch['current'][0], args.batch_size, f'{kargs["output_dir"]}/{I}_current_cars')
            save_image(batch['current'][1], args.batch_size, f'{kargs["output_dir"]}/{I}_current_tavuks')
            save_image(batch['current'][2], args.batch_size, f'{kargs["output_dir"]}/{I}_current_cross')
            save_image(batch['current'][3], args.batch_size, f'{kargs["output_dir"]}/{I}_current_carpisma')
            next_state = env_model.predict_next(batch['current'])
            for J in range(3):
                save_image(next_state[0 + J * 4], args.batch_size, f'{kargs["output_dir"]}/{I}_next_{J}_cars')
                save_image(next_state[1 + J * 4], args.batch_size, f'{kargs["output_dir"]}/{I}_next_{J}_tavuks')
                save_image(next_state[2 + J * 4], args.batch_size, f'{kargs["output_dir"]}/{I}_next_{J}_cross')
                save_image(next_state[3 + J * 4], args.batch_size, f'{kargs["output_dir"]}/{I}_next_{J}_carpisma')
        print(loss)
        if td_model.include_derivative:
            loss = loss[0]
        sw.add([loss], I)
        network_saver.on_step()
        td_exponent.on_step()
        target_network_update.on_step()
        if target_network_update.is_step() == 0:
            td_model.v_model_eval.set_weights(td_model.v_model.get_weights())
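# --- Standalone illustration (not part of the training pipeline) ---
# run_td's loop prioritizes states by TD error: each candidate is drawn with
# probability p_i = |d_i| ** beta / sum_j |d_j| ** beta, where beta is the
# (decaying) td_exponent. A self-contained sketch with made-up TD errors:
def _demo_td_error_sampling():
    import numpy as np
    td_errors = np.array([0.05, -0.40, 0.10, 0.80, -0.02])  # made-up values
    beta = 2.0                                 # run_td's default td_exponent
    props = np.power(np.abs(td_errors), beta)
    props = props / props.sum()                # normalize to a distribution
    # states with large |TD error| are sampled far more often
    return np.random.choice(len(td_errors), size=3, replace=False, p=props)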
args = parser.parse_args()

from PIL import Image
from envs.gym_env import gym_env
from envs.env_transform import WarmUp, ActionRepeat, ObservationStack
from utils.preprocess import *
from runner.runner import Runner
from agents.agent import DqnAgent, DqnAgentOps, EGreedyOps, EGreedyAgent
from utils.memory import ReplayBuffer, NStepBuffer
from nets.net import TabularQModel, DqnOps, init_nn_library
import tensorflow as tf
import keras.backend as K
from envs.env import GridEnv

init_nn_library(True, "0")

if args.game == "Grid":
    env = GridEnv()
else:
    env = gym_env(args.game)

#print(env.observation_space.n)
modelOps = DqnOps(env.action_count)
modelOps.dueling_network = args.dueling_dqn
modelOps.INPUT_SIZE = env.observation_space.n
modelOps.LEARNING_RATE = 0.2

q_model = TabularQModel(modelOps)