n_games = 1000 gammas = [float(args.gamma)] mem_sizes = [int(args.memory)] epsilon_decs = [float(args.eps_decay)] for gamma in gammas: for mem_size in mem_sizes: for epsilon_dec in epsilon_decs: print('Gamma {} - Mem Size {} - Epsilon Decay {}'.format( gamma, mem_size, epsilon_dec)) agent = Agent(gamma=gamma, epsilon=1.0, lr=lr, input_dims=env.observation_space.shape, n_actions=env.action_space.n, mem_size=mem_size, epsilon_dec=epsilon_dec, batch_size=64, epsilon_end=0.01, saveModel='models/best_model.h5') scores = [] avg_scores = [] eps_history = [] iter_time = [] avg_times = [] mem_full = [] for i in range(n_games): start = time.time() done = False score = 0
plt.title('NTN (6472)') plt.gcf().autofmt_xdate() plt.grid() plt.plot(x, y) def print_result(obj, bpw, hold): print(obj.t.strftime("%Y-%m-%d"), '{:<5}'.format(str(obj.stock_price_t)), f'Agent: (BuyPower, Stock)=({bpw:.0f}, {hold})') if __name__ == "__main__": s_time = '2020-01-11' # 取引開始日 e_time = '2020-11-11' # 取引終了日 sim = Simulator(s_time, e_time) # シミュレーター生成 agent = Agent() # エージェント生成 buy_pw_agent = 100000 # 買い付け余力 stockholdings_agent = 0 # 保有株式数 while (sim.e_time > sim.t): sim.next_day() date = sim.t.strftime("%Y-%m-%d") price = sim.stock_price_t if price != 'None': # 土日,祝日はスキップ # エージェントの意思決定 agent.make_decision(price, buy_pw_agent, stockholdings_agent, date) # 取引 if agent.action == 'BUY': tmp_bp = buy_pw_agent - agent.volume * price tmp_sh = stockholdings_agent + agent.volume
def setUp(self):
    """Run the base ``TestCase`` setup, then give the fixture its agents.

    After this call ``self.agents`` is a list holding a single ``Agent``
    built with default arguments.
    """
    unittest.TestCase.setUp(self)
    # One default-constructed agent is all the fixture starts with.
    self.agents = [Agent()]
parser.add_argument('env_id', nargs='?', default='CartPole-v1', help='Select the environment to run') args = parser.parse_args() # You can set the level to logger.DEBUG or logger.WARN if you # want to change the amount of output. logger.set_level(logger.INFO) env = gym.make(args.env_id) rewards = [] #env = wrappers.Monitor(env, force=True) #env.seed(0) agent = Agent(env) episode_count = 200 reward = 0 done = False for i in range(episode_count): ob = env.reset() prev_ob = ob episode_reward = 0 while True: action = agent.act(ob, reward, done) ob, reward, done, _ = env.step(action) agent.learn(prev_ob, action, ob, reward, done) prev_ob = ob episode_reward += reward
def getAgent(self):
    """Build the ``Agent`` that plays according to this algorithm.

    The agent is named after this object's ``__str__()`` result and its
    strategy forwards both of its arguments to ``self.getAction``.
    """
    def strategy(i, s):
        # Resolve getAction at call time so the strategy always delegates
        # to whatever this object currently exposes.
        return self.getAction(i, s)

    return Agent(name=self.__str__(), strategy=strategy)
import tornado.web from tornado import gen import os import base64 import re import json import numpy as np import time import hyperparameters as hp from agent import Agent from action import Action from PIL import Image from io import BytesIO static_path = os.path.join(os.getcwd(), "static") agent = Agent() class MainHandler(tornado.web.RequestHandler): def get(self): self.redirect("/static/v2.curves.html") # self.redirect("/static/v4.final.html") class FrameHandler(tornado.web.RequestHandler): def post(self): data = json.loads(self.get_arguments("telemetry")[0]) ar = np.fromstring(base64.decodestring(self.request.body), dtype=np.uint8) image = ar.reshape(hp.INPUT_SIZE, hp.INPUT_SIZE, hp.NUM_CHANNELS) left, right, faster, slower = data["action"]
def design_agent_and_env(FLAGS):
    """Configure and build the hierarchical agent and its environment.

    The configuration is organised in three sections:

    1. Design agent: number of levels in the hierarchy, max sequence length
       in which each policy will specialize, max number of atomic actions
       allowed in an episode, and environment timesteps per atomic action.
    2. Design environment: Mujoco model, initial state space, end goal and
       subgoal spaces, state projection functions and achievement thresholds.
    3. Miscellaneous hyperparameters: subgoal testing percentage, subgoal
       penalty, exploration noise and replay buffer size.

    Args:
        FLAGS: Run options object. ``FLAGS.layers`` and ``FLAGS.time_scale``
            are overwritten here, ``FLAGS.show`` is forwarded to
            ``Environment`` and the whole object is handed to ``Agent``.

    Returns:
        tuple: ``(agent, env)``, the constructed ``Agent`` and ``Environment``.
    """
    # ------------------------------------------------------------------
    # 1. DESIGN AGENT
    # ------------------------------------------------------------------
    FLAGS.layers = 3  # Number of levels in the agent hierarchy
    FLAGS.time_scale = 10  # Max sequence length in which each policy will specialize

    # Max number of atomic actions. This will typically be
    # FLAGS.time_scale**FLAGS.layers (1000 here); this configuration uses a
    # shorter episode length.
    max_actions = 500

    timesteps_per_action = 15  # Environment time steps per atomic action

    # ------------------------------------------------------------------
    # 2. DESIGN ENVIRONMENT
    #
    # a. The designer must provide the original UMDP (S,A,T,G,R).
    #    - The S,A,T components can be fulfilled by providing the Mujoco model.
    #    - The initial state space must be specified separately.
    #    - G can be provided by specifying the end goal space.
    #    - R, which by default uses a shortest path {-1,0} reward function,
    #      is implemented by specifying two components: (i) a function that
    #      maps the state space to the end goal space and (ii) the end goal
    #      achievement thresholds for each dimension of the end goal.
    # b. To convert the original UMDP into a hierarchy of k UMDPs, the
    #    designer must also provide
    #    - the subgoal action space, A_i, for all higher-level UMDPs i > 0
    #    - R_i for levels 0 <= i < k-1 (i.e. all levels that try to achieve
    #      goals in the subgoal space), again as (i) a function that maps the
    #      state space to the subgoal space and (ii) the subgoal achievement
    #      thresholds.
    # c. Subgoal and end goal visualization can be updated in the
    #    "display_subgoal" and "display_end_goal" methods in "environment.py".
    # ------------------------------------------------------------------

    # File name of the Mujoco model; the file must be stored in the
    # "mujoco_files" folder.
    model_name = "ant_reacher.xml"

    # Initial state space: ranges for all joint positions and velocities.
    # Every joint starts from a fixed value (lower bound == upper bound)
    # except the first two entries, which are sampled from [-9.5, 9.5]
    # (the random initial torso position).
    initial_joint_pos = np.array([
        0, 0, 0.55, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, -1.0, 0.0, -1.0, 0.0,
        1.0
    ])
    initial_joint_pos = np.reshape(initial_joint_pos,
                                   (len(initial_joint_pos), 1))
    initial_joint_ranges = np.concatenate(
        (initial_joint_pos, initial_joint_pos), 1)
    initial_joint_ranges[0] = np.array([-9.5, 9.5])
    initial_joint_ranges[1] = np.array([-9.5, 9.5])

    # Concatenate the velocity ranges (all zero); there is one fewer velocity
    # entry than position entries.
    initial_state_space = np.concatenate(
        (initial_joint_ranges, np.zeros((len(initial_joint_ranges) - 1, 2))),
        0)

    # End goal space. Separate train and test spaces are supported in case a
    # larger space is wanted for training; further customization belongs in
    # the "get_next_goal" method in "environment.py".
    max_range = 9.5
    goal_space_train = [[-max_range, max_range], [-max_range, max_range],
                        [0.45, 0.55]]
    goal_space_test = [[-max_range, max_range], [-max_range, max_range],
                       [0.45, 0.55]]

    def project_state_to_end_goal(sim, state):
        """Map a state to the end goal space (its first three entries).

        Used (i) to determine whether the agent should be given the sparse
        reward and (ii) by Hindsight Experience Replay to determine which
        end goal was achieved after a sequence of actions.
        """
        return state[:3]

    # End goal achievement thresholds. If the agent is within the threshold
    # for each dimension, the end goal has been achieved and the reward of 0
    # is granted.
    len_threshold = 0.5
    height_threshold = 0.2
    end_goal_thresholds = np.array(
        [len_threshold, len_threshold, height_threshold])

    # Range of each subgoal dimension, used to configure the subgoal actor
    # networks. Five dimensions: two planar positions, a height and two
    # velocity components.
    cage_max_dim = 11.75
    max_height = 1
    max_velo = 3
    subgoal_bounds = np.array([[-cage_max_dim, cage_max_dim],
                               [-cage_max_dim, cage_max_dim],
                               [0, max_height], [-max_velo, max_velo],
                               [-max_velo, max_velo]])

    def project_state_to_subgoal(sim, state):
        """Map the simulator state to the 5-dimensional subgoal space.

        Takes the first two position entries as-is, caps the third at
        ``max_height`` and clamps the first two velocity entries to
        ``[-max_velo, max_velo]`` so the projection stays inside
        ``subgoal_bounds``.
        """
        height = min(sim.data.qpos[2], max_height)
        return np.concatenate((
            sim.data.qpos[:2],
            np.array([height]),
            np.clip(sim.data.qvel[:2], -max_velo, max_velo),
        ))

    # Subgoal achievement thresholds, one per subgoal dimension.
    velo_threshold = 0.5
    subgoal_thresholds = np.array([
        len_threshold, len_threshold, height_threshold, velo_threshold,
        velo_threshold
    ])

    # To properly visualize goals, update the "display_end_goal" and
    # "display_subgoals" methods in "environment.py".

    # ------------------------------------------------------------------
    # 3. SET MISCELLANEOUS HYPERPARAMETERS
    #    a. Subgoal testing percentage
    #    b. Subgoal penalty
    #    c. Exploration noise
    #    d. Replay buffer size
    # ------------------------------------------------------------------
    agent_params = {}

    # Percentage of actions that a subgoal level (i.e. level i > 0) will use
    # to test subgoal actions.
    agent_params["subgoal_test_perc"] = 0.3

    # Penalty for missing a subgoal. By default the Q-value target for a
    # missed subgoal does not include the Q-value of the next state (i.e.
    # discount rate = 0), so the target just equals the penalty: -10 with the
    # time_scale set above. To incorporate the next state in the penalty, see
    # the "penalize_subgoal" method in "layer.py".
    agent_params["subgoal_penalty"] = -FLAGS.time_scale

    # Exploration noise added to both subgoal actions and atomic actions.
    # Noise added is Gaussian N(0, noise_percentage * action_dim_range).
    # NOTE(review): 8 atomic entries presumably match the model's actuator
    # count; confirm against the Mujoco file.
    agent_params["atomic_noise"] = [0.2] * 8
    agent_params["subgoal_noise"] = [0.2] * len(subgoal_thresholds)

    # Number of episodes of transitions stored by each level of the hierarchy.
    agent_params["episodes_to_store"] = 500

    # Training schedule: training alternates between exploration and testing;
    # this is the number of exploration episodes per cycle. The number of
    # testing episodes is set in the training script.
    agent_params["num_exploration_episodes"] = 100

    # For other relevant agent hyperparameters, see "agent.py" and "layer.py".

    # Ensure the environment customizations have been properly entered.
    check_validity(model_name, goal_space_train, goal_space_test,
                   end_goal_thresholds, initial_state_space, subgoal_bounds,
                   subgoal_thresholds, max_actions, timesteps_per_action)

    # Instantiate and return agent and environment.
    env = Environment(model_name, goal_space_train, goal_space_test,
                      project_state_to_end_goal, end_goal_thresholds,
                      initial_state_space, subgoal_bounds,
                      project_state_to_subgoal, subgoal_thresholds,
                      max_actions, timesteps_per_action, FLAGS.show)
    agent = Agent(FLAGS, env, agent_params)

    return agent, env
# But only the last row provides new information to each state, so we could simply get those values state = states[0].reshape(3, 8)[-1] table = BT() table.column_headers = state_vector_names table.append_row(state.tolist()) print(table) env.close() # Test Agent # ---------- state_size, action_size = brain.vector_observation_space_size, brain.vector_action_space_size agent = Agent(num_agents=num_agents, state_size=state_size, action_size=action_size) print('Capacity of the Actor (# of parameters): ', count_parameters(agent.actor_local)) print('Capacity of the Critic (# of parameters): ', count_parameters(agent.critic_local)) # Training # -------- #@timeit def train(env): ''' Trains on an environment '''
def _run_episode(env, brain_name, agent, num_agents):
    """Play one training episode.

    Returns:
        tuple: ``(score, duration)``, the mean of the per-agent accumulated
        rewards and the wall-clock seconds spent in the step loop.
    """
    scores = np.zeros(num_agents)
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations  # current state of each agent
    agent.reset()
    t0 = time.time()

    # Run the episode for at most MAX_ITERS steps.
    for i in range(MAX_ITERS):
        # Select an action for each agent and observe the result.
        actions = agent.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += rewards

        # Hand every agent's transition to the learner.
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            agent.step(state, action, reward, next_state, done)

        if i % PRINT_EVERY == 0:
            print('\rStep {}\tAverage Score: {:.2f}'.format(
                i, np.mean(scores)), end="")

        # Learn from experiences on the replay buffer.
        if i % LEARN_PERIOD == 0:
            for _ in range(NUM_SAMPLES):
                agent.sampleandlearn()

        # The episode ends as soon as any of the agents is done.
        if np.any(dones):
            break

        # Roll over states to the next time step.
        states = next_states

    return np.mean(scores), time.time() - t0


def train(env):
    """Train an agent on the Unity environment.

    Args:
        env: Ignored. The value is immediately replaced by a freshly created
            ``UnityEnvironment(file_name=ENV)``; the parameter is kept only
            so existing callers keep working.

    Returns:
        tuple: ``(agent, scores_global)`` on success, where ``scores_global``
        holds the mean score of every episode; ``(None, None)`` if an
        exception is raised during the training loop.

    The environment is always closed before returning, including when setup
    fails or the run is interrupted from the keyboard (the interrupt itself
    is re-raised rather than swallowed).
    """
    print('Loading environmnet...\n')
    env = UnityEnvironment(file_name=ENV)
    try:
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        env_info = env.reset(train_mode=True)[brain_name]

        print('Loading agent...\n')
        num_agents = len(env_info.agents)
        state_size = brain.vector_observation_space_size
        action_size = brain.vector_action_space_size
        agent = Agent(num_agents=num_agents, state_size=state_size,
                      action_size=action_size)
        print('Capacity of the Actor (# of parameters): ',
              count_parameters(agent.actor_local))
        print('Capacity of the Critic (# of parameters): ',
              count_parameters(agent.critic_local))

        # Start below any reachable score so the first running average always
        # counts as an improvement (the previous initial value, an empty
        # list, made the comparison below meaningless).
        best_mean = -np.inf
        scores_global = []
        # NOTE(review): the progress message says "last 100" but the window
        # length is PRINT_EVERY; confirm they are meant to match.
        scores_concur = deque(maxlen=PRINT_EVERY)

        try:
            print('Initializing training...\n')
            for e in range(1, EPISODES + 1):
                score, deltatime = _run_episode(env, brain_name, agent,
                                                num_agents)
                scores_concur.append(score)
                scores_global.append(score)
                running_mean = np.mean(scores_concur)

                print('\rEpisode {}, Average last 100 scores: {:.2f}, '
                      'Episode Duration: {:.2f}, \n'.format(
                          e, running_mean, deltatime))

                # Save the models whenever the running average is the best
                # one seen so far.
                if running_mean > best_mean:
                    torch.save(agent.actor_local.state_dict(),
                               'checkpoint_actor_{}.pth'.format(e))
                    torch.save(agent.critic_local.state_dict(),
                               'checkpoint_critic_{}.pth'.format(e))
                    best_mean = running_mean

            print('Closing envionment...\n')
            return agent, scores_global
        except Exception as err:
            # Keep the (None, None) contract for callers, but say what
            # actually went wrong instead of hiding it.
            print('There were some error wile training')
            print('{}: {}'.format(type(err).__name__, err))
            return None, None
    finally:
        env.close()
env = gym.make('LunarLander-v2') env.seed(0) print('State shape: ', env.observation_space.shape) print('Number of actions: ', env.action_space.n) PRIORIIZED_REPLAY = True DUELING = True DDQN = True for ddqn in [True]: for dueling in [True]: for prioritized_replay in [True]: print('Using:') print(' * DDNQ: ', ddqn) print(' * DUELING: ', dueling) print(' * PRIORITIZED_REPLAY: ', prioritized_replay) agent = Agent(state_size=8, action_size=4, seed=0, prioritized_replay=prioritized_replay, dueling=dueling, ddqn=ddqn) scores = dqn(n_episodes=2000) # plot the scores fig = plt.figure() ax = fig.add_subplot(111) plt.plot(np.arange(len(scores)), scores) plt.ylabel('Score') plt.xlabel('Episode #') plt.show() for i in range(5): state = env.reset() for j in range(2000): action = agent.act(state) env.render()
current_NN.model.set_weights(m_tmp.get_weights()) best_NN.model.set_weights(m_tmp.get_weights()) #otherwise just ensure the weights on the two players are the same else: best_player_version = 0 best_NN.model.set_weights(current_NN.model.get_weights()) #copy the config file to the run folder copyfile('./config.py', run_folder + 'config.py') plot_model(current_NN.model, to_file=run_folder + 'models/model.png', show_shapes = True) print('\n') ######## CREATE THE PLAYERS ######## current_player = Agent('current_player', env.state_size, env.action_size, config.MCTS_SIMS, config.CPUCT, current_NN) best_player = Agent('best_player', env.state_size, env.action_size, config.MCTS_SIMS, config.CPUCT, best_NN) #user_player = User('player1', env.state_size, env.action_size) iteration = 0 start = time.time() for i in config.LOOP: while 1: iteration += 1 reload(lg) reload(config) print('ITERATION NUMBER ' + str(iteration)) lg.logger_main.info('BEST PLAYER VERSION: %d', best_player_version)
if __name__=="__main__": # currently runs 100 games of the agent against a random player. Takes about a minte random.seed(2) win=0 loss=0 tie=0 turn=0 n=10 for i in range(n): game=Board() A=Agent(game) print("Game {}".format(i)) while not game.end: A.make_move() game.random_move() if game.winner=='A': win+=1 turn+=game.turn elif game.winner=='B': loss+=1 elif game.winner is None: tie+=1
gamma = 0.999 eps_start = 1 eps_end = 0.01 eps_decay = 0.001 target_update = 10 #update the target network every 10 episode memory_size = 100000 lr = 0.001 #learning rate num_episodes = 1000 #set the device use cpu or gpu device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #enviorement manager em = CartPoleEnvManager(device) #create the strategy strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay) #create agent agent = Agent(strategy, em.num_actions_available(), device) #create replay memory memory = ReplayMemory(memory_size) #create policy network and target network #pass height and width to create appropriate input shape policy_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device) target_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device) target_net.load_state_dict(policy_net.state_dict()) target_net.eval() optimizer = optim.Adam(params=policy_net.parameters(), lr=lr)
""" Get the player move as an index in the board """ valid_moves = ["a1", "a2", "a3", "b1", "b2", "b3", "c1", "c2", "c3"] while True: move = input("Where would you like to go? (a2, b1, c3, ...): ") if move in valid_moves: return valid_moves.index(move) else: print("Invalid move: " + move) comp = X human = O agent = Agent(plays=comp, episodes=10_000) if __name__ == "__main__": playing = True while playing: env = TicTacToe() # initialise game env.render() # if user does not go first then the agent makes a prediction if input("Would you like to go first? (yes/no): ").lower() != "yes": action = agent.predict(env) env.step(action, player=comp) env.render() game_over = False while not game_over:
def main():
    """Train the A2C network and log training statistics to TensorBoard.

    Builds the vectorised environment, network, optimizer, agent and
    experience source from ``HYPERPARAMS``, then consumes experiences in
    batches of ``batch_size``: each full batch is unpacked, passed to
    ``agent.learn`` and the returned statistics are written as scalars.
    The loop stops when the reward tracker reports that the stop reward has
    been reached.
    """
    HYPERPARAMS = {
        "breakout": {
            # Alternative environment: "BreakoutNoFrameskip-v4"
            "env_name": "PongNoFrameskip-v4",
            "gamma": 0.99,
            "learning_rate": 0.003,
            "entropy_beta": 0.03,
            "batch_size": 128,
            "accumulation_steps": 10,
            "n_envs": 5,
            "reward_steps": 4,
            "stop_reward": 500,
            "adam_eps": 1e-3,
        }
    }
    params = HYPERPARAMS["breakout"]

    device = T.device("cuda" if T.cuda.is_available() else "cpu")
    writer = SummaryWriter("run")
    env = GymEnvVec(params["env_name"], params["n_envs"])
    net = A2C(env.envs[0].observation_space.shape,
              env.envs[0].action_space.n)
    optimizer = optim.Adam(net.parameters(),
                           lr=params["learning_rate"],
                           eps=params["adam_eps"])
    agent = Agent(net, params["batch_size"], params["entropy_beta"])
    exp_source = ExperienceSourceFirstLast(env, agent, params["gamma"],
                                           params["reward_steps"])

    batch = []
    with RewardTracker(writer, stop_reward=params["stop_reward"]) as tracker:
        for step, exp in enumerate(exp_source):
            batch.append(exp)

            # Only used to track the total reward: a non-empty result means
            # an episode has just finished.
            new_reward = exp_source.pop_total_reward()
            if new_reward and tracker.reward(new_reward[0], step):
                break

            if len(batch) < params["batch_size"]:
                continue

            # unpack_batch yields the tuple
            # (batch_states, batch_actions, batch_qvals).
            batch_args = unpack_batch(batch,
                                      net,
                                      params["gamma"],
                                      params["reward_steps"],
                                      device=device)
            batch.clear()

            optimizer.zero_grad()
            # The learning step had been commented out in favour of a
            # debugger breakpoint, leaving ``kwargs`` undefined below.
            # NOTE(review): call restored verbatim from the commented-out
            # line; confirm it matches the current Agent.learn signature.
            kwargs = agent.learn(step, *batch_args, optimizer)

            writer.add_scalar("advantage", kwargs["adv"].mean(), step)
            writer.add_scalar("values", kwargs["critic_values"].mean(), step)
            writer.add_scalar("batch_rewards", kwargs["batch_qvals"].mean(),
                              step)
            writer.add_scalar("loss_entropy", kwargs["entropy_loss"], step)
            writer.add_scalar("loss_policy", kwargs["actor_loss"], step)
            # NOTE(review): "loss_value" logs the actor loss again; this
            # looks like a copy-paste slip, but the key of the value loss
            # returned by Agent.learn is not visible here. Confirm and fix.
            writer.add_scalar("loss_value", kwargs["actor_loss"], step)
            writer.add_scalar("loss_total", kwargs["loss"], step)
            writer.add_scalar("grad_l2",
                              np.sqrt(np.mean(np.square(kwargs["grads"]))),
                              step)
            writer.add_scalar("grad_max", np.max(np.abs(kwargs["grads"])),
                              step)
            writer.add_scalar("grad_var", np.var(kwargs["grads"]), step)
def __init__(self, state_size, action_size, random_seed):
    """Create the two agents held in ``self.agents``.

    Both agents are built from the same arguments: ``state_size``,
    ``action_size``, the literal ``1`` as third positional argument and
    ``random_seed``.
    """
    pair = []
    for _ in range(2):
        pair.append(Agent(state_size, action_size, 1, random_seed))
    # Assign only once both agents exist, so a failing constructor never
    # leaves a half-filled list on the instance.
    self.agents = pair
communicationAPI.do_action(playerID, 1, self.initial_move(is_first=False)) else: communicationAPI.do_action(playerID, 1, self.initial_move(is_first=True)) self.update_resources() self.initial_move_opponent( communicationAPI.do_action(playerID, gameID, self.initial_move(is_first=False))) state = self.configure_state() while not self.is_terminated(): self.update_resources() action = agent.get_action(state) new_state, new_position = self.update_state_after_my_action(action) send_action = action if action == 'move' or action == 'buildroad' or action == 'upgradetown': if new_position is not None: send_action = send_action + ' ' + str(new_position) self.update_resources() opp_action = communicationAPI.do_action(playerID, 1, send_action) new_state = self.update_state_after_opp_action(opp_action) state = new_state if __name__ == '__main__': game = Game(Map(), Agent(epsilon=0.1, gamma=0.9, alpha=1)) game.train()
l_gamma = [1] run = 0 with open('restuls_6.txt', 'w+') as inf: for epsilon in l_epsilon: for epsilon_decay in l_epsilon_decay: for epsilon_min in l_epsilon_min: for alpha in l_alpha: for gamma in l_gamma: run += 1 inf.write( f'\n\nrun : {run} ================================================' ) agent = Agent(epsilon=epsilon, epsilon_decay=epsilon_decay, epsilon_min=epsilon_min, alpha=alpha, gamma=gamma) inf.write(f'\nepsilon: {agent.epsilon}' f'\nepsilon_decay: {agent.epsilon_decay}' f'\nepsilon_min: {agent.epsilon_min}' f'\nalpha: {agent.alpha}' f'\ngamma: {agent.gamma}') avg_rewards, best_avg_reward = interact(env, agent) inf.write(f'\nBest avg reward: {best_avg_reward}') # break # break
args.device = torch.device('cpu') # Simple ISO 8601 timestamped logger def log(s): print('[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s) # Environment env = Env(args) env.train() action_space = env.action_space() # Agent dqn = Agent(args, env) mem = ReplayMemory(args, args.memory_capacity) priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start) # Construct validation memory val_mem = ReplayMemory(args, args.evaluation_size) T, done = 0, True while T < args.evaluation_size: if done: state, done = env.reset(), False next_state, _, done = env.step(np.random.randint(0, action_space)) val_mem.append(state, None, None, done) state = next_state T += 1
win = window.Window(width=500, height=500, vsync=True, resizable=True) glEnable(GL_BLEND) glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) # needed so that egi knows where to draw egi.InitWithPyglet(win) # prep the fps display fps_display = clock.ClockDisplay() # register key and mouse event handlers win.push_handlers(on_key_press) win.push_handlers(on_mouse_press) win.push_handlers(on_resize) # create a world for agents world = World(500, 500) # add one agent world.agents.append(Agent(world)) world.obstacles.append(Obstacle(world)) # unpause the world ready for movement print("Controls: A to add an agent, O to add an object to the map, C to reset objects on the map, P to pause, and I to show direction info.") world.paused = False while not win.has_exit: win.dispatch_events() glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT) # show nice FPS bottom right (default) delta = clock.tick() world.update(delta) world.render() fps_display.draw() # swap the double buffer win.flip()
if __name__ == '__main__': # create environment object env = Environment() memory_fp = '/Users/ryan.osgar/Documents/repos/data_science/trex_memory/memory.pkl' save_path = '/Users/ryan.osgar/Documents/repos/data_science/noisy_model/model-weights' mem_length = 80000 agent = Agent(env, tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse', memory_length=mem_length, dueling=True, noisy_net=True, egreedy=True, save_memory=memory_fp, save_weights=save_path, verbose_action=True) agent.load_weights(save_path) agent.load_memory(memory_fp) agent.set_beta_schedule(beta_start=0.9, beta_max=1, annealed_samplings=2000) # agent.set_epsilon_decay_schedule(0.000001, 0.0000001, 100) agent.pretraining_steps = 0 print(f'pretraining for {agent.pretraining_steps} steps...')
def _create_agents(config_list):
    """
    Create agents with different hyper-parameters.

    Parameters
    ----------
    config_list : list of dict
        List of parameters dict. Each dict has configurations such as
        model name, learning rate, etc. The entry whose ``"name"`` is
        ``"core"`` becomes the core agent and always uses
        ``NonBatchNormalizedDQN``; every other entry picks its network class
        from ``"model"`` (unknown names fall back to ``DQN``).

    Returns
    -------
    Created agents list and core agent object.

    Notes
    -----
    The process exits with status 1 (after printing a diagnostic) when any
    exception is raised while building the agents or when no ``"core"``
    entry is present.
    """
    import traceback

    # Network classes that may be requested through config["model"].
    known_models = ("DQN", "DDQN", "DQNbn", "NonBatchNormalizedDQN")

    try:
        agents = []
        core_agent = None
        for config in config_list:
            hyper_parameters = utils.Hyperparameter(
                batch_size=config["batch_size"],
                gamma=config["gamma"],
                eps_start=config["eps_start"],
                eps_end=config["eps_end"],
                eps_decay=config["eps_decay"],
                target_update=config["target_update"],
                default_durability=config["default_durability"],
                learning_rate=config["learning_rate"],
                initial_memory=config["initial_memory"],
                n_episode=config["n_episode"],
                n_actions=config["n_action"],
                default_durability_decreased_level=config[
                    "default_durability_decreased_level"],
                default_durability_increased_level=config[
                    "default_durability_increased_level"],
                default_check_frequency=config["default_check_frequency"],
                default_healing_frequency=config["default_healing_frequency"],
                env_name=config["env_name"],
                exp_name=config["exp_name"],
                render=config["render"],
                run_name=config["run_name"],
                output_directory_path=config["output_directory_path"],
                hyper_dash=config["hyper_dash"],
                model_saving_frequency=config["model_saving_frequency"],
                parameters_name=config["name"],
                roulette_mode=config["roulette_mode"],
                max_reward=config["max_reward"],
                min_reward=config["min_reward"])

            print(config["name"])
            is_core = config["name"] == "core"

            # Resolve the network class by name so only the selected class
            # is looked up on ``models``.
            if is_core:
                model_name = "NonBatchNormalizedDQN"
            elif config["model"] in known_models:
                model_name = config["model"]
            else:
                model_name = "DQN"
            model_cls = getattr(models, model_name)

            # Both networks must live on the same device; the policy net of
            # NonBatchNormalizedDQN used to be left where it was created
            # because ``.to()`` was called without a device.
            # NOTE(review): n_actions is hard-coded to 4 although the config
            # carries "n_action"; confirm before wiring it through.
            policy_net = model_cls(n_actions=4).to(hyper_parameters.DEVICE)
            target_net = model_cls(n_actions=4).to(hyper_parameters.DEVICE)
            optimizer = optim.Adam(policy_net.parameters(),
                                   lr=hyper_parameters.LEARNING_RATE)
            agent = Agent(policy_net, target_net,
                          hyper_parameters.DEFAULT_DURABILITY, optimizer,
                          config["name"], hyper_parameters)

            if is_core:
                core_agent = agent
            else:
                agents.append(agent)
            print("Agent:{} has been done".format(config["name"]))

        if core_agent is None:
            print("P_RuntimeError:0x1000 Core agent has not been defined.")
            sys.exit(1)
        return agents, core_agent

    except Exception as e:
        # sys.exit above raises SystemExit, which is not an Exception and so
        # passes through this handler untouched.
        print("P_RuntimeError:0x1001 Some arguments is missing.")
        print(e)
        traceback.print_exc()
        sys.exit(1)
# 盤の描画 draw_board(canvas, board) # エージェントの描画 draw_agent(canvas, agent) # 書出し canvas = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB) # BGRをRGBにする cv2.imwrite(f"./@share/out-istep1-{seq}.png", canvas) seq += 1 return seq BOARD1 = read_screen_csv(FILE_PATH) AGENT1 = Agent() AGENT1.location = BOARD1.start_location[:] # Copy AGENT1.prev_location = BOARD1.start_location[:] # Copy # for (row, columns) in enumerate(BOARD1.rows): # for (column, cell) in enumerate(columns): # print(f"[{column},{row}]={cell}") print("Start...") SEQ1 = 0 SEQ1 = search(SEQ1, BOARD1, AGENT1, AGENT1.location, screenshot_func) # 後ろ向き探索のスクリーンショット screenshot_func(SEQ1, BOARD1, AGENT1) print("Finished.")
'window_size': 50, 'batch_size': 32, 'episode_count': '', 'selected_model': 'baseline' } exclude_variables = ['next_close', 'next_returns', 'done'] args['stock_name'], \ args['window_size'], \ args['episode_count'] = \ sys.argv[1], \ int(sys.argv[2]), \ int(sys.argv[3]) agent = Agent(args['window_size']) data = get_train_data(args['stock_name']) # data = data.head(1000) data['next_close'] = data['close'].shift(-1) data['next_close_diff'] = (data['next_close'] - data['close']) / data['close'] data['prev_close'] = data['close'].shift(1) data['returns_eur'] = ( (data['close'] - data['prev_close']) / data['prev_close']) + 1 data['returns_btc'] = ((1 / data['close'] - 1 / data['prev_close']) / (1 / data['prev_close'])) + 1 data['close_diff'] = (data['close'] - data['prev_close']) / data['prev_close'] data['close_diff'].fillna(0, inplace=True) data['diff'] = data['close'] - data['prev_close']
parser.add_argument( "--wallet", nargs=2, metavar=('walletname', 'walletpass'), help="The name and passphrase of the wallet to connect to.") parser.add_argument("--ephemeralwallet", action="store_true", help="Use ephemeral wallets") args = parser.parse_args() # Configure webapp LOOP = asyncio.get_event_loop() WEBAPP = web.Application() aiohttp_jinja2.setup(WEBAPP, loader=jinja2.FileSystemLoader('view')) AGENT = Agent() POST_MESSAGE_HANDLER = PostMessageHandler(AGENT.message_queue) WEBSOCKET_MESSAGE_HANDLER = WebSocketMessageHandler( AGENT.message_queue, AGENT.outbound_admin_message_queue) PROVISIONAL_CONNECTION_PROTOCOL_MESSAGE_HANLDER = \ ProvisionalConnectionProtocolMessageHandler(AGENT.message_queue) ROUTES = [ web.get('/', root), web.get('/ws', WEBSOCKET_MESSAGE_HANDLER.ws_handler), web.static('/res', 'view/res'), web.post('/indy', POST_MESSAGE_HANDLER.handle_message), web.post( '/offer', PROVISIONAL_CONNECTION_PROTOCOL_MESSAGE_HANLDER.handle_message) ]
# initialize model best_NN = residual_CNN(config.REG_CONST, config.LEARNING_RATE, (2,) + game.grid_shape, game.move_size, config.HIDDEN_CNN_LAYERS) # load model best_version = config.INITIAL_MODEL_VERSION print('Loading model ' + str(best_version) + '...') model_temp = best_NN.read(best_version) best_NN.model.set_weights(model_temp.get_weights()) print('\n') # create players best_player = Agent('best_player', game.state_size, game.move_size, config.MCTS_SIMS, config.CPUCT, best_NN) user_player = User('player1', game.state_size, game.move_size) iteration = 0 play_again = 'yes' while play_again != 'no': print('\n') scores, _, points, sp_scores = play_matches_between_networks(game, -1, best_version, 1, turns_to_tau0=0, goes_first=0) print('\nScores: ') print(scores) print('\nFirst PLAYER / Second PLAYER SCORES') print(sp_scores) print('Play again?') play_again = input()
from domain import Domain
from agent import Agent

if __name__ == "__main__":
    # Protocol number -> (banner printed before training, name of the Agent
    # method that runs that protocol).
    protocols = {
        1: ("Agent1 first protocol", "train"),
        2: ("Agent2 second protocol", "train2"),
        3: ("Agent3 third protocol", "train3"),
    }

    # Ask for the domain setting first, then for the protocol to run.
    setting = int(
        input("Press 0 for deterministic setting or 1 for stochastic setting"))
    protocol = int(input("Choose the protocol you want to display: 1, 2 or 3"))

    # Configure the domain with the requested setting.
    domain = Domain()
    domain.setting = setting
    domain.update()

    if protocol not in protocols:
        print("You didn't choose a valid protocol")
        print("Protocols are between 1 and 3")
    else:
        banner, method_name = protocols[protocol]
        print(banner)
        print("-----------------------")
        chosen_agent = Agent(domain)
        # Every protocol is trained with the same argument, 100.
        getattr(chosen_agent, method_name)(100)
from agent import Agent from environment import ALE tf.set_random_seed(123) random.seed(123) init_seed = int(sys.argv[1]) init_rand = int(sys.argv[2]) with tf.Session() as sess: # Init env env = ALE(init_seed, init_rand) # Init agent agent = Agent(sess, env.ale.getMinimalActionSet()) action_repeat, screen_type = agent.getSetting() # Set env setting env.setSetting(action_repeat, screen_type) # Get a new game screen = env.new_game() # Start playing current_reward = 0 for _ in range(5000): action = agent.play(screen) reward, screen, terminal = env.act(action) current_reward += reward if terminal:
translation_field = robot_node.getField("translation") rotation_field = robot_node.getField("rotation") # 初始化左右引擎 left_motor = super_visor.getMotor("left wheel motor") right_motor = super_visor.getMotor("right wheel motor") left_motor.setPosition(float("inf")) right_motor.setPosition(float("inf")) left_motor.setVelocity(0.0) right_motor.setVelocity(0.0) # 设置state, action和agent action_space = 2 state_space = 4 max_reward = torch.tensor([2.0]) agent = Agent(state_space, action_space) # 机器人开始运行 robot_name = robot_node.getDef() print("Robot {} starts!".format(robot_name)) position = torch.tensor(translation_field.getSFVec3f()).unsqueeze(0) orientation = torch.tensor([rotation_field.getSFRotation()[3]]).unsqueeze(0) state = torch.cat((position, orientation), dim=1) # step counter step_count = 0 while super_visor.step(timestep) != -1:
"type": "LSTM", "units": 16, "return_sequences": True }, { "type": "LSTM", "units": 16, "return_sequences": False }, { "type": "Dense", "units": 16, "activation": "relu" }, { "type": "Dense", "units": 16, "activation": "relu" }] q_model = Q_Model("GRU", state_dim=env.get_state().shape, no_of_actions=env.no_of_actions, layers=dense_model, hyperparameters={"lr": 0.0001}) agent = Agent(q_model, batch_size=8, discount_factor=0.8, epsilon=1) no_of_episodes_train = 100 no_of_episodes_test = 100 sim = Simulator(env, agent) sim.train(no_of_episodes_train, epsilon_decay=0.997) agent.model.save() sim.test(no_of_episodes_test)