def main(test=False):
    if test:
        dqn = DQN()
        dqn.test(test_case_count=10000, load_dir='models/dqn.pkl')
    else:
        dqn = DQN()
        env = Env()
        # dqn.load("models/pretrained.pkl")
        print('\nCollecting experience...')
        for i_episode in range(60000):
            s = env.reset()
            ep_r = 0
            for _count in range(4):
                root_action, leaf_action = dqn.choose_action(s)
                # take action
                s_, r, done = env.step(root_action, leaf_action)
                dqn.store_transition(s, (root_action, leaf_action), r, s_)
                ep_r += r
                if dqn.memory_counter > MEMORY_CAPACITY:
                    dqn.learn()
                if done:
                    break
                s = s_
            # print('ep_r:', ep_r)
            if i_episode % 1000 == 1:
                dqn.test()
        dqn.save('models/dqn_final_no_pretrain.pkl')
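# Hypothetical entry point, not part of the original snippet: a minimal
# argparse wrapper (the --test flag name is an assumption) that switches
# main() above between training and evaluating the saved model.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train or evaluate the DQN agent')
    parser.add_argument('--test', action='store_true',
                        help='evaluate models/dqn.pkl instead of training')
    cli_args = parser.parse_args()
    main(test=cli_args.test)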
def test_attack():
    agent = Agent(args.img_stack, device)
    agent.load_param()
    env = Env(args.seed, args.img_stack, args.action_repeat)

    # load adversarial input; by default the general attack perturbation
    delta_s = np.load('param/adv_general.npy')
    if args.attack_type != 'general':
        file_path = 'param/adv_' + args.attack_type
        if args.attack_type == 'patch':
            file_path += '_' + args.patch_type
        file_path += '.npy'
        delta_s = np.load(file_path)

    # show the adversarial perturbation
    fig = plt.figure(figsize=(8, 8))
    plt.title('Stack of ' + str(args.img_stack) + ' adversarial signals seen by Agent')
    plt.axis('off')
    columns, rows = args.img_stack // 2, args.img_stack // 2
    for i in range(1, columns * rows + 1):
        # denormalize while showing the image
        img = (delta_s[i - 1] + 1) * 128
        fig.add_subplot(rows, columns, i)
        plt.imshow(img, cmap='gray')
    plt.show()

    for i_ep in range(10):
        score = 0
        state = env.reset()
        for t in range(1000):
            # step range, within the 1000-step episode, during which the attack is applied
            attack_render = [30, 40]
            if t in np.arange(attack_render[0], attack_render[1] + 1):
                if t in attack_render:
                    s_with_ds = state + delta_s
                    # clip to the image limits and denormalize for display
                    s_with_ds = np.clip(s_with_ds, -1, 0.9921875)
                    s_with_ds = (s_with_ds + 1) * 128
                    title = 'Attack started' if t == attack_render[0] else 'Attack ended'
                    title += ' (showing first frame of 4 frames visible to policy)'
                    plt.imshow(s_with_ds[0], cmap='gray')
                    plt.axis('off')
                    plt.title(title)
                    plt.show()
                state += delta_s
            action = agent.select_action(state)
            state_, reward, done, die = env.step(
                action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))
            if args.render:
                env.render()
            score += reward
            state = state_
            if done:
                break
        print('Ep {}\tScore: {:.2f}\t'.format(i_ep, score))
def get_children_state(self):
    new_states = []
    env = Env(mid=True, mid_state=self.cur_state)
    actions = np.argwhere(env.feasible_actions)
    for action in actions:
        new_state = env.get_new_state(action)
        new_states.append([new_state[:, :, 0]])
    return new_states
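# Minimal sketch of the Node wrapper that get_children_state() above belongs
# to; this is an assumption, not the original class. The constructor order and
# attribute names are inferred from the calls Node(state, parent, ctg, cost)
# and the accesses .cur_state / .parent / .cost in the search snippet below.
class Node:
    def __init__(self, cur_state, parent, ctg, cost):
        self.cur_state = cur_state  # 7x7 board occupancy array
        self.parent = parent        # parent Node, or None for the root
        self.ctg = ctg              # learned cost-to-go estimate for this state
        self.cost = cost            # path cost accumulated from the root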
def dfs_search(target=None):
    env = Env()
    dfs = DFS()
    initial_and_target_state = env.get_current_state()
    # time.clock() was removed in Python 3.8; perf_counter() gives the same wall-clock timing
    start_time = time.perf_counter() * 1000
    success = dfs.dfs(env)
    print(dfs.action)
    end_time = time.perf_counter() * 1000
    print('time: {} ms'.format(end_time - start_time))
    return success, end_time - start_time, initial_and_target_state, dfs.action
def Astart(heuristic, lambda_):
    # Best-first search over board states: nodes are expanded in order of
    # heuristic cost-to-go plus lambda_-weighted path cost.
    open_list = []
    close_list = []
    ctg_list = []
    new_goal_states, new_goal_path = generate_goal_states()
    play_env = Env()
    init_states_flatten = DCAnet.state_to_nnet_input([play_env.state[:, :, 0]])
    open_list.append(Node(play_env.state[:, :, 0], None,
                          heuristic([init_states_flatten]), 0))
    cur_expand_node = open_list.pop(0)
    while cur_expand_node.cur_state.tolist() not in new_goal_states:
        close_list.append(cur_expand_node)
        child_states = cur_expand_node.get_children_state()
        if child_states != []:
            child_states_flatten = DCAnet.state_to_nnet_input(child_states)
            child_ctg = heuristic(child_states_flatten)
            for child, ctg in zip(child_states, child_ctg):
                if np.count_nonzero(child[0] == 1) >= 10:
                    child_node = Node(child[0], cur_expand_node, ctg,
                                      1 + cur_expand_node.cost)
                    open_list.append(child_node)
                    ctg_list.append(ctg + (1 + cur_expand_node.cost) * lambda_)
        # expand the node with the lowest weighted cost next
        min_ctg = np.argmin(ctg_list)
        ctg_list.pop(min_ctg)
        cur_expand_node = open_list.pop(min_ctg)
        play_env = Env(mid=True, mid_state=cur_expand_node.cur_state)
        if len(close_list) % 10000 == 0:
            print('Progress log:')
            print('Length of close = %s' % (len(close_list)))
        # if cur_expand_node.get_children_state() == []:
        #     print('DEAD END ENCOUNTER, Num of pegs left is %s' % play_env.n_pegs)
        #     print(cur_expand_node.cur_state)
        #     print('\n')
    previous_path_idx = new_goal_states.index(cur_expand_node.cur_state.tolist())
    previous_path = new_goal_path[previous_path_idx]
    path = []
    path.append(cur_expand_node.cur_state)
    while cur_expand_node.parent is not None:
        path.append(cur_expand_node.parent.cur_state)
        cur_expand_node = cur_expand_node.parent
    return previous_path, path, len(close_list) + len(open_list)
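# Hypothetical usage, not in the original source: running the search above
# with a learned cost-to-go heuristic. The loader name DCAnet.load_model,
# the predict() call, and the checkpoint path are assumptions; the only
# contract relied on is that heuristic(batch_of_flattened_states) returns one
# cost-to-go estimate per state, as Astart() expects.
def run_astar_example():
    net = DCAnet.load_model('models/dca_net.pkl')  # hypothetical loader
    heuristic = lambda flat_states: net.predict(flat_states)
    previous_path, path, n_expanded = Astart(heuristic, lambda_=0.2)
    print('nodes expanded:', n_expanded)
    print('solution length:', len(path) + len(previous_path))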
def populate_buffer(agent, n_workers, buffer):
    env = Env()
    agents = [agent for _ in range(n_workers)]
    pool = ThreadPool(n_workers)
    while len(buffer.buffer) < buffer.capacity:
        results = pool.map(collect_random_data, agents)
        for data in results:
            shuffle(data)
            buffer.add_list(data)
    pool.close()
    pool.join()
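# Minimal sketch, assuming a plain list-backed replay buffer; the real Buffer
# class used by populate_buffer() and main() lives elsewhere in the repo.
# Only the interface relied on above is modeled: .buffer, .capacity and
# .add_list().
class Buffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []

    def add_list(self, data):
        # append a list of transition dicts, dropping the oldest entries once
        # the capacity is exceeded
        self.buffer.extend(data)
        if len(self.buffer) > self.capacity:
            self.buffer = self.buffer[-self.capacity:]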
def generate_dataset_from_raw(self, raw_data_file_path):
    # each item holds a state array and a list of actions, so the file is an
    # object array and needs allow_pickle on recent NumPy versions
    raw_data = np.load(raw_data_file_path, allow_pickle=True)
    dataset = []
    for item in raw_data:
        state = item[0]
        # the second half of the state vector encodes the target state
        init_target = state[int(len(state) / 2):]
        actions = item[1]
        env = Env(target_state=init_target)
        for action in actions:
            dataset.append({"state": state, "action": action})
            state, reward, done = env.step(action[0], action[1])
            if done:
                break
    return dataset
def generate_dataset(raw_data_file_path):
    # object array (state + action list per item), so allow_pickle is required
    raw_data = np.load(raw_data_file_path, allow_pickle=True)
    dataset = []
    for item in raw_data:
        state = item[0]
        init_target = state[int(len(state) / 2):]
        actions = item[1]
        env = Env(target_state=init_target)
        for action in actions:
            dataset.append((state, action))
            state, reward, done = env.step(action[0], action[1])
            if done:
                break
    np.save('dataset/dataset_2.npy', dataset)
    print(dataset)
def test(self, test_case_count=200, load_dir=None):
    self.target_net = self.target_net.eval()
    if load_dir is not None:
        self.target_net.load_state_dict(torch.load(load_dir))
    count = 0
    total_length = 0
    for _ in tqdm(range(test_case_count)):
        env = Env()
        s = env.get_current_state()
        ep_r = 0
        for i in range(4):
            x = torch.unsqueeze(torch.FloatTensor(s), 0)  # input only one sample
            root_result, leaf_result = self.target_net(x)
            root_action = torch.argmax(root_result).item()
            if root_action != 3:
                leaf_action = torch.argmax(leaf_result[root_action]).item()
                # step
                s_, r, done = env.step(root_action, leaf_action)
            else:
                find_path_result = leaf_result[3]
                find_path_source = torch.argmax(
                    find_path_result[:, :int(find_path_result.shape[1] / 2)]).item()
                find_path_target = torch.argmax(
                    find_path_result[:, int(find_path_result.shape[1] / 2):]).item()
                # step
                s_, r, done = env.step(
                    root_action, (find_path_source, find_path_target))
            ep_r += r
            s = s_
            if done:
                if ep_r > 0:
                    total_length += i
                break
        if ep_r > 0:
            count += 1
    acc = float(count) / test_case_count
    if acc > self.max_acc and load_dir is None:
        torch.save(self.target_net.state_dict(), 'models/dqn.pkl')
        self.max_acc = acc
    print("acc is: ", acc)
    if count > 0:
        # the loop index undercounts the episode length by one, so add it back here
        print("length is: ", float(total_length) / count + 1)
def play2(self, arg):
    step_for = 16
    step_bck = 15
    match = False
    for_inter_state = []
    bck_inter_state = []
    _, _, heuristic = arg
    while not match:
        env_for = Env()
        env_bck = DCAEnv()
        non_ter = False
        for _ in range(step_for):
            action = self.naive_policy(env_for, heuristic, env_for.feasible_actions)
            _, _, end = env_for.step(action)
            if end:
                non_ter = True
                break
        if non_ter:
            # rollout terminated early; store an all-ones dummy board as a placeholder
            for_inter_state.append(np.ones((7, 7)).tolist())
        else:
            for_inter_state.append(env_for.state[:, :, 0].tolist())
        non_ter = False
        for _ in range(step_bck):
            action = self.naive_policy(env_bck, heuristic, env_bck.feasible_actions, False)
            _, _, end = env_bck.step(action)
            if end:
                non_ter = True
                break
        if non_ter:
            bck_inter_state.append(np.ones((7, 7)).tolist())
        else:
            bck_inter_state.append(env_bck.state.tolist())
        if env_for.state[:, :, 0].tolist() in bck_inter_state:
            match = True
        if env_bck.state.tolist() in for_inter_state:
            match = True
        if len(for_inter_state) == 1000:
            return 1000, False
    return len(for_inter_state), True
def run_agent():
    agent = Agent(args.img_stack, device)
    agent.load_param()
    env = Env(args.seed, args.img_stack, args.action_repeat)
    state = env.reset()

    # Prepare attack
    attack = AdvAttack(args.attack_type)
    attack.initialize_perturbation(state.shape)
    attack.load_networks()

    for i_ep in range(50):
        score = 0
        state = env.reset()
        for t in range(1000):
            action = agent.select_action(state)
            # update buffer for training the attack
            attack.update_buffer(state)
            # write to tensorboard
            input_imgs_to_net = torch.tensor(
                (attack.buffer['s'] + attack.buffer['d_s']))
            input_imgs_grid = make_grid(input_imgs_to_net[0].reshape(4, 1, 96, 96))
            writer.add_image('Four stack of input state with adversarial',
                             input_imgs_grid)
            writer.add_graph(attack.net, input_imgs_to_net)
            writer.close()
            # train attack
            attack.train()
            state_, reward, done, die = env.step(
                action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))
            if args.render:
                env.render()
            score += reward
            state = state_
            if done or die:
                break
        print('Ep {}\tScore: {:.2f}\t'.format(i_ep, score))
def main():
    env = Env(enable_draw=True, base_fix=False)
    agent = Agent(env)

    time_horizon = 10
    com_pos = np.array([0.0, 0, 0.1])
    rpy = np.zeros(3)
    com_vel = np.zeros(3)
    base_ang_vel = np.zeros(3)
    target_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
    target_x = target_x.reshape((-1, 1))
    target_u = np.array([0, 0, env.model.mass * 0.25 * 9.8] * 4).reshape((12, 1))
    init_u_list = np.array([target_u for i in range(time_horizon)])

    state = env.reset()
    t = 0
    while t < 10:
        com_pos = env.model.com_pos
        rpy = env.model.base_rpy
        com_vel = env.model.base_vel
        base_ang_vel = np.matmul(env.model.base_rot.T, env.model.base_ang_vel)
        init_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
        init_x = init_x.reshape((-1, 1))

        delta_time_list = np.array([0.01] * time_horizon)
        foot_pos_list = np.array(
            [env.model.foot_pos_list for i in range(time_horizon + 1)])
        contact_phi_list = np.array([[1, 1, 1, 1] for i in range(time_horizon + 1)])
        target_x_list = np.array([target_x for i in range(time_horizon + 1)])
        target_u_list = np.array([target_u for i in range(time_horizon)])

        action, u_list = agent.get_action(init_x, init_u_list, delta_time_list,
                                          foot_pos_list, contact_phi_list,
                                          target_x_list, target_u_list)
        init_u_list = deepcopy(u_list)

        state = env.step(action)
        # time.sleep(env.time_step)
        t += env.time_step
def beam_search(net, beam_size=3):
    states = []
    probs = []
    trajectories = []
    env = Env()
    for i in range(beam_size):
        states.append(deepcopy(env))
        probs.append(1.0)
        trajectories.append([])
    for _ in range(4):
        candidate_states = []
        for k in range(beam_size):
            s = states[k].get_current_state()
            x = torch.unsqueeze(torch.FloatTensor(s), 0)  # input only one sample
            actions_value = net(x)
            candidates = topk_actions(actions_value, beam_size)
            for i in range(beam_size):
                # step
                env = deepcopy(states[k])
                action = candidates[i][1]
                temp_traj = copy.copy(trajectories[k])
                temp_traj.append((states[k].get_current_state(), action))
                s_, r, done = env.step(action[0], action[1])
                new_state = env
                if r > 0:
                    return True, temp_traj
                # if (new_state, candidates[i][0] * probs[k], temp_traj) not in candidate_states:
                #     candidate_states.append((new_state, candidates[i][0] * probs[k], temp_traj))
                candidate_states.append(
                    (new_state, candidates[i][0] * probs[k], temp_traj))
        candidate_states = sorted(candidate_states, key=lambda x: x[1], reverse=True)
        for i in range(beam_size):
            states[i] = candidate_states[i][0]
            probs[i] = candidate_states[i][1]
            trajectories[i] = candidate_states[i][2]
    return False, None
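# Hypothetical usage, not part of the original snippet: running beam search
# with the trained network used by DQN.test() elsewhere in this file. Reusing
# dqn.target_net and the 'models/dqn.pkl' checkpoint is an assumption; any
# network with the same (root, leaf) output structure would work.
def run_beam_search_example():
    dqn = DQN()
    dqn.target_net.load_state_dict(torch.load('models/dqn.pkl'))
    dqn.target_net.eval()
    success, trajectory = beam_search(dqn.target_net, beam_size=3)
    if success:
        for state, action in trajectory:
            print('action taken:', action)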
def play_greedy_game(verbose=True):
    """
    Plays a Tichu game with four "greedy" players (greedyAgent).

    greedyAgent is an agent with very simple heuristic play moves: it always
    tries to win a stack except when the opponent is leading.

    Raises an exception if 10 consecutive false moves are made. (This should
    not happen when the environment and greedyAgent are bug-free.)
    """
    agent = greedyAgent()
    env = Env(train_mode=not verbose)
    state, rewards, done, active_player = env.reset()
    conseq_active_counter = 0
    cumulative_reward = [0, 0, 0, 0]
    while True:
        my_state = state[active_player]
        action = agent.act(my_state)
        last_active = active_player
        if not env.game.players[active_player].finished:
            cumulative_reward[active_player] += rewards[active_player]
        state, rewards, done, active_player = env.step(active_player, action)
        new_active = active_player
        if last_active == new_active:
            conseq_active_counter += 1
        else:
            conseq_active_counter = 0
        if done:
            if verbose:
                print('-----')
            for i in range(4):
                cumulative_reward[i] += rewards[i]
                if verbose:
                    print('Cumulative reward of player {}: {}'.format(
                        i, cumulative_reward[i]))
            return
        if conseq_active_counter > 10:
            raise Exception(
                "Active counter exceeded. Possible infinite loop detected.")
def _env_init(self):
    self.env = Env()
def _env_init(self):
    self.env = Env()
    self.env.check()
from logging import getLogger, StreamHandler, DEBUG

import tensorflow as tf

from net import Net
from history import History
from env.env import Env
import settings

logger = getLogger('train')
logger.setLevel(DEBUG)
logger.addHandler(StreamHandler())

if __name__ == '__main__':
    servers_count = 3
    containers_count = 5

    tf.reset_default_graph()  # clear the TensorFlow graph
    env = Env(servers_count=servers_count, containers_count=containers_count)
    net = Net(learning_rate=settings.LEARNING_RATE,
              input_count=env.state_size,
              output_count=env.actions_count,
              hidden_count=settings.NN_HIDDEN_COUNT)
    history = History()

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        gradBuffer = sess.run(tf.trainable_variables())
        for i, grad in enumerate(gradBuffer):
import torch

from network.FullConnected import Net
import env.action as action_def
from env.env import Env

env = Env()
N_STATES = env.n_state


class GrammarNet(torch.nn.Module):
    def forward(self, state):
        root_result = self.root_grammar(state)
        delete_node_result = self.delete_node_grammar(state)
        delete_edge_result = self.delete_edge_grammar(state)
        filter_result = self.filter_grammar(state)
        find_path_result = self.find_path_grammar(state)
        return root_result, [
            delete_node_result, delete_edge_result, filter_result,
            find_path_result
        ]

    def __init__(self):
        super().__init__()
        self.root_grammar = Net(n_state=N_STATES,
                                n_action=len(action_def.action))
        self.delete_node_grammar = Net(n_state=N_STATES,
                                       n_action=len(action_def.delete_node_action))
        self.delete_edge_grammar = Net(n_state=N_STATES,
                                       n_action=len(action_def.delete_edge_action))
def test_env_state():
    env = Env()
    state, _, _, _ = env.reset()
    assert np.shape(state) == (4, 4, 3)
    for i in range(4):
        assert sum(state[i][0][2]) == 14
def main():
    env = Env(enable_draw=True, base_fix=False)
    agent = Agent(env)

    delta_time = 0.025
    time_horizon = 10
    com_pos = np.array([0.0, 0, 0.25])
    rpy = np.zeros(3)
    com_vel = np.zeros(3)
    base_ang_vel = np.zeros(3)
    target_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
    target_x = target_x.reshape((-1, 1))
    target_u = np.array([0, 0, env.model.mass * 0.25 * 9.8] * 4).reshape((12, 1))
    init_u_list = np.array([target_u for i in range(time_horizon)])

    # periodic contact schedule: the two leg pairs alternate swing phases,
    # separated by full-support phases
    temp_length = int(0.3 / delta_time)
    temp_contact_phi_list = ([[0, 1, 1, 0]] * temp_length +
                             [[1, 1, 1, 1]] * temp_length +
                             [[1, 0, 0, 1]] * temp_length +
                             [[1, 1, 1, 1]] * temp_length)
    total_contact_phi_list = np.array([[1, 1, 1, 1]] * temp_length +
                                      temp_contact_phi_list * 1000)

    state = env.reset()
    t = 0
    last_t = 0
    while t < 100:
        if last_t == 0 or t - last_t >= delta_time:
            last_t = t
            com_pos = env.model.com_pos
            print(com_pos)
            rpy = env.model.base_rpy
            com_vel = env.model.base_vel
            base_ang_vel = np.matmul(env.model.base_rot.T, env.model.base_ang_vel)
            init_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
            init_x = init_x.reshape((-1, 1))

            delta_time_list = np.array([delta_time] * time_horizon)
            foot_pos_list = np.array(
                [env.model.foot_pos_list for i in range(time_horizon + 1)])
            contact_phi_list = total_contact_phi_list[:time_horizon + 1]
            total_contact_phi_list = total_contact_phi_list[1:]
            target_x_list = np.array(
                [target_x for i in range(time_horizon + 1)])
            target_u_list = np.array([target_u for i in range(time_horizon)])

            action, u_list = agent.get_action(init_x, init_u_list,
                                              delta_time_list, foot_pos_list,
                                              contact_phi_list, target_x_list,
                                              target_u_list)
            init_u_list = deepcopy(u_list)
            for leg_idx in range(4):
                if contact_phi_list[0, leg_idx] == 0.0:
                    # legs out of contact get a fixed command instead of the MPC output
                    action[leg_idx * 3:(leg_idx + 1) * 3] = [0, 0, -3.0]

        state = env.step(action, contact_phi_list[0, :])
        t += env.time_step
def main():
    config = read_config("config.yaml")
    agent_config = config['Agent']
    network_config = agent_config['Network']
    training_config = config['Training']
    files_config = config['Files']
    eval_config = config['Evaluation']

    print('\t\t --------------------------------------------')
    print('\t\t ------ Parameters of the experiment ------')
    print('\t\t --------------------------------------------\n')
    print('## Agent params')
    print('Agent : ' + agent_config['name'])
    print('Gamma : ', agent_config['gamma'])
    print('')
    print('## Network Params')
    print('Network used : ' + network_config['name'])
    print('Number of filters : ', network_config['n_filters'])
    print('activation function : ' + network_config['activation'])
    print('state embedding size : ', network_config['state_embedding_size'])
    print('')
    print('## Training params')
    print('Number of iteration : ', training_config['n_iter'])
    print('Learning rate : ', network_config['lr'])
    print('Number of games per iteration : ', training_config['n_games'])
    print('Number of workers : ', training_config['n_workers'])
    print('Batch size : ', training_config['batch_size'])
    print('Buffer size : ', training_config['buffer_size'])
    print('')
    print('## Evaluation params')
    print('Number of games per iteration : ', eval_config['n_games'])
    print('Number of workers : ', eval_config['n_workers'])
    print('')
    sleep(2.0)

    # Init files and tensorboard
    model_name = agent_config['name']
    checkpoints_dir = os.path.join(model_name, files_config['checkpoints_dir'])
    tensorboard_log_dir = os.path.join(model_name, files_config['tensorboard_log_dir'])
    results_log_path = os.path.join(model_name, files_config['results_log_path'])

    # fix random seed
    if config['Seed'] is None:
        np.random.seed(seed=42)
    else:
        np.random.seed(int(config['Seed']))

    print('\n\n')
    env = Env()

    # if training from scratch
    if training_config["init_checkpoint"] == 0:
        # initialize dir for tensorboard
        flush_or_create(tensorboard_log_dir)
        # initialize dir for checkpoints
        flush_or_create(checkpoints_dir)
        # init agent and network from scratch
        agent = ActorCriticAgent(agent_config, network_config, checkpoints_dir,
                                 tensorboard_log_dir)
        # initialize iteration number
        start = 0
    # else restart training from the last checkpoint
    else:
        latest_checkpoint = training_config["init_checkpoint"]
        agent = ActorCriticAgent(agent_config, network_config, checkpoints_dir,
                                 tensorboard_log_dir, restore=True)
        print('\nnetwork restored from checkpoint # ', latest_checkpoint)
        print('')
        start = latest_checkpoint

    # initialize the summary writer and results log file
    log_file = open(results_log_path, "wb+")  # log file written to during evaluation

    display_every = training_config["display_every"]
    n_games_train = training_config["n_games"]
    n_workers_train = training_config["n_workers"]
    T_update_net = training_config["T_update_net"]
    T_update_target_net = training_config["T_update_target_net"]
    n_games_eval = eval_config["n_games"]
    n_workers_eval = eval_config["n_workers"]
    prefill_buffer = training_config["prefill_buffer"]
    # gamma = agent_config['gamma']
    summary_dict = dict({})

    data_buffer = Buffer(capacity=training_config['buffer_size'])
    logger = logging.getLogger(__name__)

    if prefill_buffer:
        # populate buffer with initial data from random games
        print('\nPopulating Buffer ...\n')
        populate_buffer(agent, n_workers_train, data_buffer)

    print('\n\n')
    print('Starting training\n\n')
    batch_size = training_config['batch_size']
    for it in tqdm(np.arange(start, training_config["n_iter"]),
                   desc="parallel gameplay iterations"):
        # play games to generate data and train the network
        env.reset()
        try:
            agent.train(env, n_games_train, data_buffer, batch_size,
                        n_workers_train, display_every, T_update_net)
        except Exception as error:
            print('\n\n#### AN ERROR OCCURRED WHILE TRAINING ####\n\n')
            agent.net.summary_writer.close()
            agent.net.sess.close()
            log_file.close()
            logger.error(error)
            raise
        agent.net.save_checkpoint(checkpoints_dir, it=it + 1)

        # play games with the latest checkpoint and track the average final reward
        results = agent.evaluate(env, n_games_eval, n_workers_eval)
        # save results
        pickle.dump(results, log_file)
        print('')

    agent.net.summary_writer.close()
    agent.net.sess.close()
    log_file.close()
    print('End of training')
def collect_random_data(agent):
    env = Env()
    random_agent = RandomAgent()
    end = False
    states = []
    actions = []
    rewards = []
    data = []
    discount_G = 1.0
    G = 0.
    t = 0
    while not end:
        states.append(env.state)
        action = random_agent.select_action(env.feasible_actions)
        action_index = 4 * action[0] + action[1]
        actions.append(action_index)
        reward, _, end = env.step(action)
        rewards.append(reward)
        # discount = gamma
        # for s in range(t):
        #     values[t-s-1] += discount * reward
        #     discount = discount * gamma
        t += 1
        G += discount_G * reward
        discount_G = discount_G * agent.gamma

    R = 0.
    # evaluate state values of all states encountered in a batch to save time
    state_values = agent.net.get_value(
        np.array(states).reshape(-1, 7, 7, agent.state_channels)).reshape(-1)
    for s in range(t):
        R = rewards[t - s - 1] + agent.gamma * R
        advantage = R - state_values[t - s - 1]
        data = [
            dict({
                "state": states[t - s - 1],
                "advantage": advantage,
                "action": actions[t - s - 1],
                "critic_target": R
            })
        ] + data
    assert (G == R)
    assert (len(state_values) == len(states) == len(actions) == len(rewards) == t)
    # data = []
    # for s in range(len(states)-1):
    #     advantage = rewards[s] + values[s+1] - values[s]
    #     data.append(dict({"state": states[s],
    #                       "advantage": advantage,
    #                       "critic_target": values[s],
    #                       "action": actions[s]}))
    # T = len(states)-1
    # advantage = rewards[T] - values[T]  # next state value is 0 because it is terminal
    # data.append(dict({"state": states[T],
    #                   "advantage": advantage,
    #                   "critic_target": values[T],
    #                   "action": actions[T]}))
    return data