def record_model(args_path, weight_path):
    """Record a sequence of episodes from a neural network.

    Args:
        args_path (str): Path of the JSON file containing the options used to play the episodes.
        weight_path (str): Path of the neural network parameters to load.

    Returns:
        dict: Buffers of encoded images, one per recorded episode.
    """
    with open(args_path, 'r') as file_args:
        args = json.load(file_args)
    args['--gui'] = True
    print(args)

    # Experiment options
    episode_count = 5
    width, height = 300, 300
    steps_per_second = 20
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    device = 'cpu'

    # We create the environment
    env = make_intrusion_env(args)
    env.seed(args['--train_seed'])

    # We create the actor network
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]
    actor_net = PolicyNetwork(state_dim, action_dim,
                              args['--hidden']).to(device)

    # We load the weights
    checkpoint = torch.load(weight_path, map_location=device)
    actor_net.load_state_dict(checkpoint['actor_model_state_dict'])
    actor_net.eval()

    # We play the episodes and record the rendered frames
    data = {}
    with torch.no_grad():
        for episode in range(episode_count):
            done = False
            data['episode_{}'.format(episode)] = []
            state = env.reset(intruder_position=[100, 100])
            episode_reward = 0
            step = 0
            while not done:
                step += 1
                print('step', step)
                img = Image.fromarray(env.render(mode='rgb_array'))
                data['episode_{}'.format(episode)].append(encode_img(img))
                model_state = torch.FloatTensor(state).to(device)
                action = actor_net(model_state).detach().cpu().numpy()
                # action = ou_noise.get_action(action, step)
                state, reward, done, _ = env.step(action)
    env.close()
    return data
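
# `record_model` relies on an `encode_img` helper that is not defined in this
# file. The function below is only a sketch of a plausible implementation,
# assuming the episode buffers are meant to hold JSON-serialisable strings: it
# encodes a PIL image as a base64 PNG. Adjust it to whatever format the
# consumer of the returned `data` dict actually expects.
def encode_img(img):
    """Encode a PIL image as a base64 PNG string (illustrative sketch)."""
    import base64
    import io
    buffer = io.BytesIO()
    img.save(buffer, format='PNG')
    return base64.b64encode(buffer.getvalue()).decode('ascii')
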
def train_model(args):
    seed_experiment(args['--train_seed'])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # We create the environment
    env = make_intrusion_env(args)
    args['--port'] = env.link.connector.simu_port
    parser._write_options(args['--exp'], 'exp_options.json', args)
    args['--save_interval_performance'] = 1000

    qdrl_algo = QDRLAlgo(args, device, env)
    qdrl_algo.train_model()
def record_architecture(args):
    """Construct an image of the neural network architecture.

    Args:
        args (dict): Relevant options to construct the image.

    Returns:
        dict: Visualizations of the defined actor and critic networks.
    """
    # We create the environment
    env = make_intrusion_env(args)

    # We create the actor-critic network
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]
    actor_net = PolicyNetwork(state_dim, action_dim, args['--hidden'])
    critic_net = ActionValueNetwork(state_dim, action_dim, args['--hidden'])

    # We return the architecture images
    return {
        **actor_net.visualize(state_dim),
        **critic_net.visualize(state_dim, action_dim)
    }
def plot_ac(args, actors, device):
    fig = plt.figure()
    fig.set_dpi(100)
    fig.set_size_inches(7, 6.5)
    number_of_actors = len(actors)
    ax = plt.axes(xlim=(-100, 100), ylim=(-100, 100))

    global done_gv
    done_gv = [False] * number_of_actors
    envs = [make_intrusion_env(args) for _ in range(number_of_actors)]
    global state_gv
    state_gv = [envi.reset() for envi in envs]

    goal = plt.Circle((0, 0), 5, fc='r')
    obstacles = []
    for i, obstacle in enumerate(envs[0].space.obstacle_list):
        obstacles.append(
            plt.Rectangle((obstacle.center.x - obstacle.dimensions.x / 2,
                           obstacle.center.y - obstacle.dimensions.y / 2),
                          obstacle.dimensions.x,
                          obstacle.dimensions.y,
                          fc='g'))

    intruders = [None] * number_of_actors
    for id_env, envi in enumerate(envs):
        for i, intruder_id in envi.action_intruder.items():
            entity_obj = envi.space.get_pedestrian(intruder_id)
            if entity_obj is not None:
                intruders[id_env] = plt.Circle(
                    (entity_obj.geometry.position.x,
                     entity_obj.geometry.position.y),
                    1,
                    fc='C' + str(id_env))

    def init():
        for obs in obstacles:
            ax.add_patch(obs)
        for intruder in intruders:
            ax.add_patch(intruder)
        ax.add_patch(goal)
        return [goal] + obstacles + intruders

    def animate(i):
        global done_gv
        global state_gv
        for id_env, envi in enumerate(envs):
            if not done_gv[id_env]:
                action = actors[id_env](state_gv[id_env])
                next_state, reward, done_, info_ = envi.step(action)
                state_gv[id_env] = next_state
                done_gv[id_env] = done_
                for i, intruder_id_ in envi.action_intruder.items():
                    entity_obj_ = envi.space.get_pedestrian(intruder_id_)
                    if entity_obj_ is not None:
                        intruders[id_env].center = (
                            entity_obj_.geometry.position.x,
                            entity_obj_.geometry.position.y)
                    else:
                        intruders[id_env].center = (.5, .5)
        return [goal] + obstacles + intruders

    anim = animation.FuncAnimation(fig,
                                   animate,
                                   init_func=init,
                                   frames=360,
                                   interval=20,
                                   blit=True)
    plt.show()
    for envo in envs:
        envo.close()
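
# `plot_ac` expects each entry of `actors` to be a plain callable mapping a
# NumPy state to a NumPy action, since the result is fed directly to env.step.
# The helper below is a hypothetical sketch showing how a checkpoint saved by
# `train_model` (with an 'actor_model_state_dict' entry) could be wrapped into
# such a callable; the function name and signature are assumptions, not part
# of the existing code base.
def make_actor_callable(weight_path, state_dim, action_dim, hidden,
                        device='cpu'):
    """Load a policy checkpoint and wrap it as a NumPy-in/NumPy-out callable (sketch)."""
    actor = PolicyNetwork(state_dim, action_dim, hidden).to(device)
    checkpoint = torch.load(weight_path, map_location=device)
    actor.load_state_dict(checkpoint['actor_model_state_dict'])
    actor.eval()

    def act(state):
        with torch.no_grad():
            model_state = torch.FloatTensor(state).to(device)
            return actor(model_state).cpu().numpy()

    return act
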
def train_model(args):
    seed_experiment(args['--train_seed'])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # We create the environment
    env = make_intrusion_env(args)
    args['--port'] = env.link.connector.simu_port
    parser._write_options(args['--exp'], 'exp_options.json', args)
    args['--save_interval_performance'] = 1000
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]

    # We create the action noise process for exploration around the behavior policy
    # TODO: handle conditional parsers
    # if args['--noise'] == 'Gaussian':
    #     noise_process_exploration = Gaussian(env.action_space,
    #                                          args['--g_min_sigma'],
    #                                          args['--g_max_sigma'],
    #                                          decay_period=args['--g_decay'])
    if args['--noise'] == 'epsilon_greedy':
        noise_process_exploration = EpsilonGreedy(
            args['--epsilon_start'],
            args['--epsilon_end'],
            args['--decay_period'],
            action_space=env.action_space)

    # We create the value and policy networks as well as their targets
    critic_net1, critic_net2, target_critic_net1, target_critic_net2 = [
        ActionValueNetwork(state_dim, action_dim, args['--hidden']).to(device)
        for _ in range(4)
    ]
    actor_net, target_actor_net = (PolicyNetwork(state_dim, action_dim,
                                                 args['--hidden']).to(device),
                                   PolicyNetwork(state_dim, action_dim,
                                                 args['--hidden']).to(device))

    # We create the optimizers
    actor_optimizer = torch.optim.Adam(actor_net.parameters(),
                                       lr=args['--policy_lr'])
    critic1_optimizer = torch.optim.Adam(critic_net1.parameters(),
                                         lr=args['--value_lr'])
    critic2_optimizer = torch.optim.Adam(critic_net2.parameters(),
                                         lr=args['--value_lr'])

    # We initialize the target networks to be identical to the online networks
    soft_update(critic_net1, target_critic_net1, soft_tau=1.)
    soft_update(critic_net2, target_critic_net2, soft_tau=1.)
    soft_update(actor_net, target_actor_net, soft_tau=1.)

    # We create the replay buffer
    if args['--replay_buffer_kickstart_file'] is not None:
        replay_buffer = ReplayBuffer.load_from_file(
            args['--replay_buffer_kickstart_file'])
    else:
        replay_buffer = ReplayBuffer(args['--buffer'])

    # We create the criterion
    td3_criterion = TD3Criterion(actor_net,
                                 target_actor_net,
                                 critic_net1,
                                 critic_net2,
                                 target_critic_net1,
                                 target_critic_net2,
                                 gamma=args['--gamma'],
                                 soft_tau=args['--soft_tau'],
                                 noise_std=args['--g_smooth_sigma'],
                                 noise_clip=args['--g_smooth_clip'],
                                 device=device)

    # We prepare the experiment
    exp_options = {
        'episode_reward_train': {'plot': 'line', 'yscale': 'linear'},
        'episode_reward_test': {'plot': 'line', 'yscale': 'linear'},
        'episode_reward_test_sparse': {'plot': 'line', 'yscale': 'linear'},
        'episode_reward_test_sparse_ring': {'plot': 'line', 'yscale': 'linear'},
        'score_train': {'plot': 'line', 'yscale': 'linear'},
        'score_test': {'plot': 'line', 'yscale': 'linear'},
        'score_test_sparse': {'plot': 'line', 'yscale': 'linear'},
        'score': {'plot': 'line', 'yscale': 'linear'},
        'actor_loss': {'plot': 'line', 'yscale': 'log'},
        'critic_loss_1': {'plot': 'line', 'yscale': 'log'},
        'critic_loss_2': {'plot': 'line', 'yscale': 'log'},
    }
    agent_id = 0
    description = 'TD3: {} with {} frames for training'.format(
        args['--env_name'], args['--budget'])
    exp_id = create_experiment(args['--exp'], description, './', exp_options)
    print('exp_id', exp_id)
    storage = ExpLoggerAgent(
        exp_id, agent_id, os.path.join(args['--exp'], 'agent_0'), {
            'critic_model1': critic_net1,
            'critic_model2': critic_net2,
            'actor_model': actor_net
        }, {
            'critic1_optimizer': critic1_optimizer,
            'critic2_optimizer': critic2_optimizer,
            'actor_optimizer': actor_optimizer
        })

    reward_buffer_test = deque(maxlen=100)
    reward_buffer_test_sparse = deque(maxlen=100)
    reward_buffer_test_sparse_ring = deque(maxlen=100)
    reward_buffer_train = deque(maxlen=100)

    # We train the networks
    step_idx = 0
    episode_idx = 0
    episode_reward_train = 0
    state = env.reset()
    step_idx_in_episode = 0
    while step_idx < args['--budget']:
        actor_net.eval()

        # Do one step in the environment and save the transition
        model_state = torch.FloatTensor(state).to(device)
        action = actor_net(model_state).detach().cpu().numpy()
        action = noise_process_exploration.get_action(action, t=step_idx)
        next_state, reward, done, _ = env.step(action)
        if not done or step_idx_in_episode != 0:
            replay_buffer.push(state, action, reward, next_state, done)
        episode_reward_train += reward

        # Train/update the actor and critics by resampling transitions from the replay buffer
        if step_idx % args['--delay_policy_update'] == 0:
            actor_net.train()
        critic_net1.train()
        critic_net2.train()
        if len(replay_buffer) > args['--batch_size']:
            # Sample from the replay buffer
            state_replay, action_replay, reward_replay, next_state_replay, done_replay = replay_buffer.sample(
                args['--batch_size'])

            # Compute, store and optimize the losses
            critic_loss1, critic_loss2, actor_loss = td3_criterion.loss(
                state_replay, action_replay, reward_replay, next_state_replay,
                done_replay)
            critic1_optimizer.zero_grad()
            critic_loss1.backward(retain_graph=True,
                                  inputs=list(critic_net1.parameters()))
            critic2_optimizer.zero_grad()
            critic_loss2.backward(retain_graph=True,
                                  inputs=list(critic_net2.parameters()))
            if step_idx % args['--delay_policy_update'] == 0:
                actor_optimizer.zero_grad()
                actor_loss.backward(inputs=list(actor_net.parameters()))
            critic1_optimizer.step()
            critic2_optimizer.step()
            if step_idx % args['--delay_policy_update'] == 0:
                actor_optimizer.step()
                soft_update(critic_net1, target_critic_net1,
                            args['--soft_tau'])
                soft_update(critic_net2, target_critic_net2,
                            args['--soft_tau'])
                soft_update(actor_net, target_actor_net, args['--soft_tau'])

            # Save and print performance information
            if (step_idx % args['--save_interval_performance'] == 0
                    and step_idx > 1 and len(reward_buffer_test) > 1):
                storage.performance(
                    step_idx, {
                        'critic_loss_1': critic_loss1.item(),
                        'critic_loss_2': critic_loss2.item(),
                        'actor_loss': actor_loss.item()
                    })
                mean_reward_train = sum(reward_buffer_train) / len(
                    reward_buffer_train)
                mean_reward_test = sum(reward_buffer_test) / len(
                    reward_buffer_test)
                mean_reward_test_sparse = sum(reward_buffer_test_sparse) / len(
                    reward_buffer_test_sparse)
                storage.performance(step_idx,
                                    {'score_train': mean_reward_train})
                storage.performance(step_idx, {'score_test': mean_reward_test})
                storage.performance(
                    step_idx, {'score_test_sparse': mean_reward_test_sparse})
                storage.write()
                print('Loss at {}/{}: value1={:.4}, value2={:.4}, policy={:.4}.'
                      .format(step_idx, args['--budget'], critic_loss1.item(),
                              critic_loss2.item(), actor_loss.item()))
                print('Result train at {}/{}: {}.'.format(
                    step_idx, args['--budget'], mean_reward_train))
                print('Result test at {}/{}: {}.'.format(
                    step_idx, args['--budget'], mean_reward_test))

        # Save the weights of the model
        if step_idx % args['--save_interval'] == 0:
            storage.state(step_idx)

        # Do not forget to update time and state
        step_idx += 1
        step_idx_in_episode += 1
        state = next_state

        if done:
            # The episode came to an end
            episode_idx += 1
            step_idx_in_episode = 0
            storage.performance(
                step_idx, {'episode_reward_train': episode_reward_train})
            reward_buffer_train.append(episode_reward_train)
            episode_reward_train = 0

            if episode_idx % args['--test_frequency'] == 0:
                # Test the learned policy on one episode per setting
                actor_net.eval()
                total_number_test = 1
                episode_reward_test = 0
                episode_reward_test_sparse = 0
                episode_reward_test_sparse_ring = 0
                for test_number in range(total_number_test):
                    state_test = env.reset(
                        intruder_position=args['--intruder_position_test'],
                        reward=args['--reward'])
                    episode_reward_test += one_episode(state_test, device,
                                                       actor_net, env)
                for test_number in range(total_number_test):
                    state_test = env.reset(
                        intruder_position=args['--intruder_position_test'],
                        reward='sparse')
                    episode_reward_test_sparse += one_episode(
                        state_test, device, actor_net, env)
                for test_number in range(total_number_test):
                    state_test = env.reset(intruder_position='ring',
                                           reward='sparse')
                    episode_reward_test_sparse_ring += one_episode(
                        state_test, device, actor_net, env)

                normalized_episode_reward_test = episode_reward_test / total_number_test
                normalized_episode_reward_test_sparse = episode_reward_test_sparse / total_number_test
                normalized_episode_reward_test_sparse_ring = episode_reward_test_sparse_ring / total_number_test
                reward_buffer_test.append(normalized_episode_reward_test)
                reward_buffer_test_sparse.append(
                    normalized_episode_reward_test_sparse)
                reward_buffer_test_sparse_ring.append(
                    normalized_episode_reward_test_sparse_ring)
                storage.performance(
                    step_idx, {'episode_reward_test': episode_reward_test})
                storage.performance(
                    step_idx,
                    {'episode_reward_test_sparse': episode_reward_test_sparse})
                storage.performance(
                    step_idx, {
                        'episode_reward_test_sparse_ring':
                            episode_reward_test_sparse_ring
                    })

            state = env.reset(
                intruder_position=args['--intruder_position_train'],
                reward=args['--reward'])

    env.close()
    storage.state(step_idx)
    print('Loss at {}/{}: value1={:.4}, value2={:.4}, policy={:.4}.'.format(
        step_idx, args['--budget'], critic_loss1.item(), critic_loss2.item(),
        actor_loss.item()))
    storage.close()
    stop_experiment(exp_id)
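
# The training loop above relies on two helpers that are not defined in this
# file: `soft_update` and `one_episode`. The definitions below are sketches of
# the behaviour the loop assumes, not the project's actual implementations:
# `soft_update` performs standard Polyak averaging of the target parameters
# (soft_tau=1. makes an exact copy), and `one_episode` rolls out one greedy
# evaluation episode and returns its cumulative reward.
def soft_update(net, target_net, soft_tau):
    """Polyak-average the parameters of `net` into `target_net` (sketch)."""
    for param, target_param in zip(net.parameters(), target_net.parameters()):
        target_param.data.copy_(soft_tau * param.data +
                                (1.0 - soft_tau) * target_param.data)


def one_episode(state, device, actor_net, env):
    """Play one episode with the deterministic policy and return its total reward (sketch)."""
    episode_reward = 0
    done = False
    with torch.no_grad():
        while not done:
            model_state = torch.FloatTensor(state).to(device)
            action = actor_net(model_state).cpu().numpy()
            state, reward, done, _ = env.step(action)
            episode_reward += reward
    return episode_reward
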
weight_path = sys.argv[2]
print('args_path:', args_path, 'weight_path:', weight_path)
with open(args_path) as json_file:
    args = json.load(json_file)

# Parse the options
parser = TrainExperimentParser(
    ParserIntrusion(ParserIntruder(), ParserGuard(), ParserFixedGuard()),
    ParserQDRLIntrusion(ParserEpsilonGreedy()))
_args = parser.parse()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# We create the environment and its renderer
env = make_intrusion_env(_args)

from PIL import Image

state = env.reset()
renderer = Renderer(env.unwrapped)

# We create the actor network
action_dim = env.action_space.shape[0]
state_dim = env.observation_space.shape[0]
actor_net = PolicyNetwork(state_dim, action_dim, _args['--hidden']).to(device)

# We load the weights
checkpoint = torch.load(weight_path, map_location=device)
actor_net.load_state_dict(checkpoint['actor_model_state_dict'])
actor_net.eval()