def main(test=False):
    if test:
        dqn = DQN()
        dqn.test(test_case_count=10000, load_dir='models/dqn.pkl')
    else:
        dqn = DQN()
        env = Env()
        # dqn.load("models/pretrained.pkl")
        print('\nCollecting experience...')
        for i_episode in range(60000):
            s = env.reset()
            ep_r = 0
            for _count in range(4):
                root_action, leaf_action = dqn.choose_action(s)
                # take action
                s_, r, done = env.step(root_action, leaf_action)
                dqn.store_transition(s, (root_action, leaf_action), r, s_)
                ep_r += r
                if dqn.memory_counter > MEMORY_CAPACITY:
                    dqn.learn()
                if done:
                    break
                s = s_
            # print('ep_r:', ep_r)
            if i_episode % 1000 == 1:
                dqn.test()
        dqn.save('models/dqn_final_no_pretrain.pkl')

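# A minimal command-line entry point is one way to drive main(test=...).
# This is a sketch only: the '--test' flag name is an assumption and is not
# part of the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train or evaluate the DQN agent.')
    parser.add_argument('--test', action='store_true',
                        help='evaluate models/dqn.pkl instead of training')
    cli_args = parser.parse_args()
    main(test=cli_args.test)
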
def test_attack():
    agent = Agent(args.img_stack, device)
    agent.load_param()
    env = Env(args.seed, args.img_stack, args.action_repeat)

    # load the adversarial input; by default the general attack perturbation
    delta_s = np.load('param/adv_general.npy')
    if args.attack_type != 'general':
        file_path = 'param/adv_' + args.attack_type
        if args.attack_type == 'patch':
            file_path += '_' + args.patch_type
        file_path += '.npy'
        delta_s = np.load(file_path)

    # show the adversarial perturbation
    fig = plt.figure(figsize=(8, 8))
    plt.title('Stack of ' + str(args.img_stack) + ' adversarial signals seen by Agent')
    plt.axis('off')
    columns, rows = args.img_stack // 2, args.img_stack // 2
    for i in range(1, columns * rows + 1):
        # denormalize while showing the image
        img = (delta_s[i - 1] + 1) * 128
        fig.add_subplot(rows, columns, i)
        plt.imshow(img, cmap='gray')
    plt.show()

    for i_ep in range(10):
        score = 0
        state = env.reset()

        for t in range(1000):
            # step range [30, 40] during which the attack is applied;
            # the perturbed observation is rendered at its first and last step
            attack_render = [30, 40]
            if t in np.arange(attack_render[0], attack_render[1] + 1):
                if t in attack_render:
                    s_with_ds = (state + delta_s)
                    # clip to the image limits and denormalize for display
                    s_with_ds = np.clip(s_with_ds, -1, 0.9921875)
                    s_with_ds = (s_with_ds + 1) * 128
                    title = 'Attack started' if t == attack_render[0] else 'Attack ended'
                    title += ' (showing first frame of 4 frames visible to policy)'
                    plt.imshow(s_with_ds[0], cmap='gray')
                    plt.axis('off')
                    plt.title(title)
                    plt.show()
                state += delta_s

            action = agent.select_action(state)
            state_, reward, done, die = env.step(
                action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))
            if args.render:
                env.render()
            score += reward
            state = state_
            if done:
                break

        print('Ep {}\tScore: {:.2f}\t'.format(i_ep, score))

def run_agent():
    agent = Agent(args.img_stack, device)
    agent.load_param()
    env = Env(args.seed, args.img_stack, args.action_repeat)
    state = env.reset()

    # prepare the attack
    attack = AdvAttack(args.attack_type)
    attack.initialize_perturbation(state.shape)
    attack.load_networks()

    for i_ep in range(50):
        score = 0
        state = env.reset()

        for t in range(1000):
            action = agent.select_action(state)

            # update the buffer used for training the attack
            attack.update_buffer(state)

            # write the perturbed input to tensorboard
            input_imgs_to_net = torch.tensor(
                (attack.buffer['s'] + attack.buffer['d_s']))
            input_imgs_grid = make_grid(input_imgs_to_net[0].reshape(4, 1, 96, 96))
            writer.add_image('Four stack of input state with adversarial',
                             input_imgs_grid)
            writer.add_graph(attack.net, input_imgs_to_net)
            writer.close()

            # train the attack
            attack.train()

            state_, reward, done, die = env.step(
                action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))
            if args.render:
                env.render()
            score += reward
            state = state_
            if done or die:
                break

        print('Ep {}\tScore: {:.2f}\t'.format(i_ep, score))

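# Both test_attack() and run_agent() read options from a module-level `args`
# object. Below is a sketch of an argparse setup covering only the fields the
# two functions actually reference; the flag spellings and default values are
# assumptions, not the project's actual settings.
import argparse

parser = argparse.ArgumentParser(description='Adversarial attack on the driving agent.')
parser.add_argument('--img-stack', dest='img_stack', type=int, default=4,
                    help='number of stacked frames fed to the policy')
parser.add_argument('--action-repeat', dest='action_repeat', type=int, default=8,
                    help='number of simulator steps per agent action')
parser.add_argument('--seed', type=int, default=0, help='environment random seed')
parser.add_argument('--render', action='store_true', help='render the environment')
parser.add_argument('--attack-type', dest='attack_type', default='general',
                    help="perturbation type, e.g. 'general' or 'patch'")
parser.add_argument('--patch-type', dest='patch_type', default=None,
                    help="patch variant, used when --attack-type is 'patch'")
args = parser.parse_args()
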
def main():
    env = Env(enable_draw=True, base_fix=False)
    agent = Agent(env)

    time_horizon = 10

    # reference state: CoM 0.1 m above the ground, zero orientation and velocities
    com_pos = np.array([0.0, 0, 0.1])
    rpy = np.zeros(3)
    com_vel = np.zeros(3)
    base_ang_vel = np.zeros(3)
    target_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
    target_x = target_x.reshape((-1, 1))
    # each of the four legs nominally carries a quarter of the body weight
    target_u = np.array([0, 0, env.model.mass * 0.25 * 9.8] * 4).reshape((12, 1))
    init_u_list = np.array([target_u for i in range(time_horizon)])

    state = env.reset()
    t = 0
    while t < 10:
        com_pos = env.model.com_pos
        rpy = env.model.base_rpy
        com_vel = env.model.base_vel
        base_ang_vel = np.matmul(env.model.base_rot.T, env.model.base_ang_vel)
        init_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
        init_x = init_x.reshape((-1, 1))

        delta_time_list = np.array([0.01] * time_horizon)
        foot_pos_list = np.array(
            [env.model.foot_pos_list for i in range(time_horizon + 1)])
        contact_phi_list = np.array(
            [[1, 1, 1, 1] for i in range(time_horizon + 1)])
        target_x_list = np.array([target_x for i in range(time_horizon + 1)])
        target_u_list = np.array([target_u for i in range(time_horizon)])

        action, u_list = agent.get_action(init_x, init_u_list, delta_time_list,
                                          foot_pos_list, contact_phi_list,
                                          target_x_list, target_u_list)
        init_u_list = deepcopy(u_list)

        state = env.step(action)
        # time.sleep(env.time_step)
        t += env.time_step

def play_greedy_game(verbose=True):
    """
    Plays a Tichu game with four "greedy" players.

    Uses greedyAgent, an agent with very simple heuristic play: it always
    tries to win the current stack, except when an opponent is leading.
    Raises an exception if 10 consecutive false moves are made (the same
    player stays active), which should not happen when the environment and
    greedyAgent are bug-free.
    """
    agent = greedyAgent()
    env = Env(train_mode=not verbose)
    state, rewards, done, active_player = env.reset()
    conseq_active_counter = 0
    cummulative_reward = [0, 0, 0, 0]

    while True:
        my_state = state[active_player]
        action = agent.act(my_state)
        last_active = active_player
        if not env.game.players[active_player].finished:
            cummulative_reward[active_player] += rewards[active_player]
        state, rewards, done, active_player = env.step(active_player, action)
        new_active = active_player
        if last_active == new_active:
            conseq_active_counter += 1
        else:
            conseq_active_counter = 0
        if done:
            if verbose:
                print('-----')
            for i in range(4):
                cummulative_reward[i] += rewards[i]
                if verbose:
                    print('Cumulative reward of player {}: {}'.format(
                        i, cummulative_reward[i]))
            return
        if conseq_active_counter > 10:
            raise Exception(
                'Active counter exceeded. Possible infinite loop detected.')

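# A minimal usage example: run a single verbose greedy-vs-greedy game from the
# command line. The __main__ guard is an assumption about how the module is run.
if __name__ == '__main__':
    play_greedy_game(verbose=True)
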
def main():
    config = read_config("config.yaml")
    agent_config = config['Agent']
    network_config = agent_config['Network']
    training_config = config['Training']
    files_config = config['Files']
    eval_config = config['Evaluation']

    print('\t\t --------------------------------------------')
    print('\t\t ------ Parameters of the experiment ------')
    print('\t\t --------------------------------------------\n')
    print('## Agent params')
    print('Agent : ' + agent_config['name'])
    print('Gamma : ', agent_config['gamma'])
    print('')
    print('## Network Params')
    print('Network used : ' + network_config['name'])
    print('Number of filters : ', network_config['n_filters'])
    print('activation function : ' + network_config['activation'])
    print('state embedding size : ', network_config['state_embedding_size'])
    print('')
    print('## Training params')
    print('Number of iteration : ', training_config['n_iter'])
    print('Learning rate : ', network_config['lr'])
    print('Number of games per iteration : ', training_config['n_games'])
    print('Number of workers : ', training_config['n_workers'])
    print('Batch size : ', training_config['batch_size'])
    print('Buffer size : ', training_config['buffer_size'])
    print('')
    print('## Evaluation params')
    print('Number of games per iteration : ', eval_config['n_games'])
    print('Number of workers : ', eval_config['n_workers'])
    print('')
    sleep(2.0)

    # init files and tensorboard
    model_name = agent_config['name']
    checkpoints_dir = os.path.join(model_name, files_config['checkpoints_dir'])
    tensorboard_log_dir = os.path.join(model_name,
                                       files_config['tensorboard_log_dir'])
    results_log_path = os.path.join(model_name,
                                    files_config['results_log_path'])

    # fix the random seed
    if config['Seed'] is None:
        np.random.seed(seed=42)
    else:
        np.random.seed(int(config['Seed']))

    print('\n\n')
    env = Env()

    # if training from scratch
    if training_config["init_checkpoint"] == 0:
        # initialize dir for tensorboard
        flush_or_create(tensorboard_log_dir)
        # initialize dir for checkpoints
        flush_or_create(checkpoints_dir)
        # init agent and network from scratch
        agent = ActorCriticAgent(agent_config, network_config, checkpoints_dir,
                                 tensorboard_log_dir)
        # initialize iteration number
        start = 0
    # else restart training from the last checkpoint
    else:
        # the checkpoint number to restart from is taken from the config
        latest_checkpoint = training_config["init_checkpoint"]
        agent = ActorCriticAgent(agent_config, network_config, checkpoints_dir,
                                 tensorboard_log_dir, restore=True)
        print('\nnetwork restored from checkpoint # ', latest_checkpoint)
        print('')
        start = latest_checkpoint

    # initialize the results log file, written to during evaluation
    log_file = open(results_log_path, "wb+")

    display_every = training_config["display_every"]
    n_games_train = training_config["n_games"]
    n_workers_train = training_config["n_workers"]
    T_update_net = training_config["T_update_net"]
    T_update_target_net = training_config["T_update_target_net"]
    n_games_eval = eval_config["n_games"]
    n_workers_eval = eval_config["n_workers"]
    prefill_buffer = training_config["prefill_buffer"]
    # gamma = agent_config['gamma']

    summary_dict = dict({})
    data_buffer = Buffer(capacity=training_config['buffer_size'])
    logger = logging.getLogger(__name__)

    if prefill_buffer:
        # populate the buffer with initial data from random games
        print('\nPopulating Buffer ...\n')
        populate_buffer(agent, n_workers_train, data_buffer)

    print('\n\n')
    print('Starting training\n\n')
    batch_size = training_config['batch_size']
    for it in tqdm(np.arange(start, training_config["n_iter"]),
                   desc="parallel gameplay iterations"):
        # play games to generate data and train the network
        env.reset()
        try:
            agent.train(env, n_games_train, data_buffer, batch_size,
                        n_workers_train, display_every, T_update_net)
        except Exception as error:
            print('\n\n#### AN ERROR OCCURRED WHILE TRAINING ####\n\n')
            agent.net.summary_writer.close()
            agent.net.sess.close()
            log_file.close()
            logger.error(error)
            raise
        agent.net.save_checkpoint(checkpoints_dir, it=it + 1)

        # play games with the latest checkpoint and track the average final reward
        results = agent.evaluate(env, n_games_eval, n_workers_eval)
        # save results
        pickle.dump(results, log_file)
        print('')

    agent.net.summary_writer.close()
    agent.net.sess.close()
    log_file.close()
    print('End of training')

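# For reference, the structure that read_config("config.yaml") is expected to
# return, reconstructed from the keys main() accesses. All concrete values
# below are placeholders, not the project's actual settings.
EXAMPLE_CONFIG = {
    'Seed': None,
    'Agent': {
        'name': 'actor_critic',
        'gamma': 0.99,
        'Network': {
            'name': 'conv_net',
            'n_filters': 64,
            'activation': 'relu',
            'state_embedding_size': 128,
            'lr': 1e-3,
        },
    },
    'Training': {
        'n_iter': 1000,
        'n_games': 100,
        'n_workers': 4,
        'batch_size': 64,
        'buffer_size': 10000,
        'init_checkpoint': 0,
        'display_every': 10,
        'T_update_net': 1,
        'T_update_target_net': 100,
        'prefill_buffer': True,
    },
    'Files': {
        'checkpoints_dir': 'checkpoints',
        'tensorboard_log_dir': 'tensorboard',
        'results_log_path': 'results.log',
    },
    'Evaluation': {
        'n_games': 20,
        'n_workers': 4,
    },
}
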
def main():
    env = Env(enable_draw=True, base_fix=False)
    agent = Agent(env)

    delta_time = 0.025
    time_horizon = 10

    com_pos = np.array([0.0, 0, 0.25])
    rpy = np.zeros(3)
    com_vel = np.zeros(3)
    base_ang_vel = np.zeros(3)
    target_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
    target_x = target_x.reshape((-1, 1))
    # each leg nominally carries a quarter of the body weight
    target_u = np.array([0, 0, env.model.mass * 0.25 * 9.8] * 4).reshape((12, 1))
    init_u_list = np.array([target_u for i in range(time_horizon)])

    # gait schedule: alternate diagonal leg pairs, with full-stance phases in between
    temp_length = int(0.3 / delta_time)
    temp_contact_phi_list = ([[0, 1, 1, 0]] * temp_length +
                             [[1, 1, 1, 1]] * temp_length +
                             [[1, 0, 0, 1]] * temp_length +
                             [[1, 1, 1, 1]] * temp_length)
    total_contact_phi_list = np.array([[1, 1, 1, 1]] * temp_length +
                                      temp_contact_phi_list * 1000)

    state = env.reset()
    t = 0
    last_t = 0
    while t < 100:
        # recompute the MPC action only every delta_time seconds
        if last_t == 0 or t - last_t >= delta_time:
            last_t = t
            com_pos = env.model.com_pos
            print(com_pos)
            rpy = env.model.base_rpy
            com_vel = env.model.base_vel
            base_ang_vel = np.matmul(env.model.base_rot.T,
                                     env.model.base_ang_vel)
            init_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
            init_x = init_x.reshape((-1, 1))

            delta_time_list = np.array([delta_time] * time_horizon)
            foot_pos_list = np.array(
                [env.model.foot_pos_list for i in range(time_horizon + 1)])
            contact_phi_list = total_contact_phi_list[:time_horizon + 1]
            total_contact_phi_list = total_contact_phi_list[1:]
            target_x_list = np.array(
                [target_x for i in range(time_horizon + 1)])
            target_u_list = np.array([target_u for i in range(time_horizon)])

            action, u_list = agent.get_action(init_x, init_u_list,
                                              delta_time_list, foot_pos_list,
                                              contact_phi_list, target_x_list,
                                              target_u_list)
            init_u_list = deepcopy(u_list)

            # legs without scheduled contact get a fixed downward force command
            for leg_idx in range(4):
                if contact_phi_list[0, leg_idx] == 0.0:
                    action[leg_idx * 3:(leg_idx + 1) * 3] = [0, 0, -3.0]

        state = env.step(action, contact_phi_list[0, :])
        t += env.time_step

def test_env_state():
    env = Env()
    state, _, _, _ = env.reset()
    assert np.shape(state) == (4, 4, 3)
    for i in range(4):
        assert sum(state[i][0][2]) == 14

# (continuation of the network constructor call; its preceding arguments are not shown)
              output_count=env.actions_count,
              hidden_count=settings.NN_HIDDEN_COUNT)
history = History()

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    gradBuffer = sess.run(tf.trainable_variables())
    for i, grad in enumerate(gradBuffer):
        gradBuffer[i] = 0

    state = env.reset()
    old_action = None
    for iter_num in range(settings.TRAIN_ITERATIONS):
        action = sess.run(net.chosen_action,
                          feed_dict={net.layer_input: [state]})[0]
        # if action == old_action and action != 0:
        #     action += 1
        old_action = action
        logger.debug(
            f'Iter {iter_num}: action={action} '
            f'(avg reward={np.mean(history.get_rewards()[-100:])})')