def corrmaze(c_len):
    """Build a 3-row corridor maze of interior length c_len with two agents
    that start at opposite ends and must swap sides."""
    grid = np.zeros((3, c_len + 2))
    # Wall off the top and bottom rows along the corridor interior.
    for col in range(1, c_len + 1):
        grid[0][col] = 1
        grid[2][col] = 1
    agents = [
        Agent("[255,0]", (0, 0), (0, 0), (c_len + 1, 0)),
        Agent("[0,255]", (c_len + 1, 2), (c_len + 1, 2), (0, 2)),
    ]
    return Maze(grid, agents)
def test_exercise(capsys):
    """Agent.__str__ must produce the Bond-style introduction line."""
    for first_name in ("James", "Ionic"):
        print(Agent(first_name, "Bond"))
    out, err = capsys.readouterr()
    assert out == "My name is Bond, James Bond\nMy name is Bond, Ionic Bond\n"
def test_load_parameters():
    """Q/N tables can be loaded at construction time or afterwards."""
    env = Environment()
    # Load at construction: models/test/Q.csv is all zeros except a 1 at [2, 1].
    agent = Agent(env, load_from_directory=MODELS_DIR / "test", load_N=False)
    assert agent._Q[2, 1] == 1.0
    # Two-step loading must also work, and N values must load correctly too.
    agent = Agent(env)
    agent.load_parameters(MODELS_DIR / "test", load_N=True)
    assert agent._Q[2, 1] == 1.0
    # models/test/N.csv follows the same single-entry pattern.
    assert agent._N[2, 1] == 1
def test_save_parameters():
    """Parameters written to disk can be restored by a fresh agent."""
    # The temporary directory is removed automatically when the test ends.
    with tempfile.TemporaryDirectory() as tmp:
        target = pathlib.Path(tmp)
        env = Environment()
        agent = Agent(env)
        agent._Q[3, 1] = 1.0
        agent._N[3, 1] = 1
        agent.save_parameters(to_directory=target)
        del agent
        restored = Agent(env, load_from_directory=target)
        assert restored._Q[3, 1] == 1.0
        assert restored._N[3, 1] == 1
def run(load_path, model_path, index, load_model):
    """Run a single A3C test thread on GPU 0.

    When load_model is true, restores the latest checkpoint from model_path
    and points the environment at the requested episode window; otherwise
    initializes fresh variables.
    """
    with tf.device('/gpu:0'):
        trainer = tf.train.AdamOptimizer(learning_rate=1e-4)
        global_env = Environment(load_path=load_path,
                                 starting_index=index,
                                 final_index=index + 1)
        global_net = A3C_Network()
        agent = Agent(0, global_net.n_inputs_policy,
                      global_net.n_inputs_matching,
                      global_net.n_actions_policy,
                      trainer, load_path, model_path)
        saver = tf.train.Saver()
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            coord = tf.train.Coordinator()
            if load_model:
                ckpt = tf.train.get_checkpoint_state(model_path)
                saver.restore(sess, ckpt.model_checkpoint_path)
                # Re-pin the environment indices after restoring.
                sess.run(global_env.index.assign(index))
                sess.run(global_env.final_index.assign(index + 1))
            else:
                sess.run(tf.global_variables_initializer())
            # Run the agent's test loop on its own thread and wait for it.
            worker = threading.Thread(target=lambda: agent.test(sess, coord))
            worker.start()
            worker.join()
def main(test):
    """Train or evaluate a DDPG agent on the Unity Reacher environment.

    :param test: when true, load saved actor/critic weights and evaluate;
        otherwise train for 300 episodes and plot the scores.
    """
    env = UnityEnvironment(file_name=REACHER_APP)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    action_size = brain.vector_action_space_size        # number of actions
    state_size = env_info.vector_observations.shape[1]  # state-space dimension
    n_agents = len(env_info.agents)                     # number of agents

    agent = Agent(state_size=state_size, action_size=action_size,
                  n_agents=n_agents, random_seed=1)

    if test:
        # Evaluate: restore the trained weights from file first.
        agent.actor_local.load_state_dict(
            torch.load(f'weights/{str(agent)}_checkpoint_actor.pth'))
        agent.critic_local.load_state_dict(
            torch.load(f'weights/{str(agent)}_checkpoint_critic.pth'))
        test_agent(env, agent, n_agents)
    else:
        # Train and plot the resulting score curve.
        scores = run_agent(env, agent, n_episodes=300)
        _ = plot_scores(agent, scores)

    env.close()
def train(self, log_file_dir = "./tensorboard", index = "0"):
    """Run the configured number of training steps, then evaluate on the
    test set and log the final portfolio value.

    Every 1000 steps the average data-access and training times are logged
    and per-step summaries are emitted. If a save path is configured, the
    best checkpointed agent is restored before the final evaluation.
    """
    self.__print_upperbound()
    self.__init_tensorboard(log_file_dir)
    starttime = time.time()
    data_seconds = 0
    training_seconds = 0
    for step in range(int(self.train_config['steps'])):
        batch_start = time.time()
        X, w, y, setw = self.next_batch()
        batch_done = time.time()
        data_seconds += batch_done - batch_start
        self._agent.train_step(X, w, y, setw=setw)
        training_seconds += time.time() - batch_done
        if step % 1000 == 0 and log_file_dir:
            logging.info("average time for data accessing is %s"%(data_seconds/1000))
            logging.info("average time for training is %s"%(training_seconds/1000))
            training_seconds = 0
            data_seconds = 0
            self.log_between_steps(step)

    if self.save_path:
        # Reload the best checkpoint saved during training for evaluation.
        self._agent = Agent(self.config, restore_dir=self.save_path)

    pv_vector, loss, output = self._evaluate("test")
    pv = self._agent.portfolio_value
    log_mean = self._agent.log_mean
    logging.warning('the portfolio value train No.%s is %s log_mean is %s,'
                    ' the training time is %d seconds' % (index, pv, log_mean, time.time() - starttime))
def main():
    """
    Evaluation entry point: wrap the MineRL environment, load a trained
    agent, and run it for the configured number of evaluation episodes.
    """
    env = gym.make(MINERL_GYM_ENV)
    if FRAME_SKIP > 0:
        env = FrameSkip(env, enable_rendering=True)
    env = ObsWrapper(env)
    env = MoveAxisWrapper(env, -1, 0)
    env = CombineActionWrapper(env)

    agent = Agent(env.observation_space, env.action_space)
    agent.load_model()

    for _ in range(MINERL_MAX_EVALUATION_EPISODES):
        obs = env.reset()
        done = False
        netr = 0  # cumulative episode reward
        while not done:
            obs, reward, done, info = env.step(agent.act(obs))
            netr += reward
            env.render()

    env.close()
def pl_grad():
    """Train a policy-gradient agent on CartPole-v1, save the reward curve
    to score.png, and then render one episode with the learned policy.
    """
    # FIX: removed leftover debug statement `print("hi")`.
    environment = gym.make("CartPole-v1")
    net = nn.Sequential(nn.Linear(4, 40, bias=False),
                        nn.ReLU(),
                        nn.Linear(40, 2, bias=False),
                        nn.Softmax(dim=1))

    class DistributionNet(nn.Module):
        """Wraps the policy network so forward() yields a Categorical."""

        def __init__(self):
            super().__init__()
            self.net = net

        def forward(self, x):
            return Categorical(self.net(x))

    a_model = DistributionNet()
    optimizer = torch.optim.Adam(a_model.parameters(), lr=0.01)
    learner = PolicyGradient(environment, a_model, optimizer,
                             discount_factor=0.99)
    opt_policy, history = learner.learn_policy(epochs=500,
                                               episodes_per_update=1)

    plt.plot(history)
    plt.xlabel('episode')
    plt.ylabel('total reward')
    plt.savefig("score.png")

    agent = Agent(environment=environment, policy=opt_policy)
    input("add anything to continue")
    agent.perform_episode(render=True)
def __init__(self, config, agent=None, save_path=None, restore_dir=None, device="cpu"):
    """Set up trainer state from *config*: data matrices, train/test sets,
    RNG seeding, and the underlying Agent.

    :param config: configuration dict with "training", "input", and
        "random_seed" entries.
    :param agent: accepted but not used here; the agent is always built
        from config below.
    :param save_path: destination for the best model (None disables saving).
    :param restore_dir: directory to restore agent weights from, if any.
    :param device: compute device string (default "cpu").
    """
    self.config = config
    self.train_config = config["training"]
    self.input_config = config["input"]
    self.best_metric = 0
    self.save_path = save_path
    self._matrix = DataMatrices.create_from_config(config)
    # NOTE(review): reaches into DataMatrices' name-mangled private
    # attribute __global_data — fragile; a public accessor would be safer.
    self.time_index = self._matrix._DataMatrices__global_data.time_index.values
    self.coins = self._matrix._DataMatrices__global_data.coins.values
    self.test_set = self._matrix.get_test_set()
    self.training_set = self._matrix.get_training_set()
    # Seed TF's RNG so runs are reproducible for a given config.
    tf.random.set_seed(self.config["random_seed"])
    self.device = device
    self._agent = Agent(config, time_index=self.time_index, coins=self.coins, restore_dir=restore_dir)
    self.keras_test = self._matrix.keras_batch(data="test")
def plot_grid_2_mc():
    """Run MC first-visit control on selected type-2 test grids (204, 208)
    and report any grid that fails to reach its best known reward.
    """
    test_grids = TEST_GRIDS
    all_test_list = [(key, grid) for key, grid in test_grids.items()]
    # BUG FIX: sorted() returns a new list and its result was previously
    # discarded, leaving the grids unsorted. Sort in place instead.
    all_test_list.sort(key=lambda x: x[0])
    agent = Agent()
    repeats = REPEATS
    for ind, grid_init in all_test_list:
        normalized_score = 0
        for j in range(repeats):
            grid_num = int(ind)  # ind initially is a string
            # Only type-2 grids (numbers 200-300) are considered here.
            if (grid_num < 200) or (grid_num > 300):
                continue
            best_reward = grid_init['best_reward']
            testgrid = Grid(5, random=False, init_pos=grid_init)
            if grid_num in {204, 208}:
                Q, policy = agent.mc_first_visit_control(testgrid.copy(),
                                                         iters=500)
                _, _, mc_reward = agent.run_final_policy(testgrid.copy(), Q,
                                                         display=True)
            else:
                continue
            # Accumulate how far the MC result falls short of (or exceeds)
            # the best known reward across repeats.
            normalized_score += mc_reward - best_reward
        if normalized_score != 0:
            print(
                "Grid num {0} did not achieve best score".format(grid_num))
def from_image(file):
    """Parse a maze definition from an image file.

    Pixels equal to [0, 0, 0, 255] (opaque black) are walls. The blue
    channel marks agent features: 16 = start position, 80 = waypoint,
    128 = goal; the pixel also encodes the agent's name.
    """
    arr = np.array(Image.open(file))
    height, width, _ = arr.shape

    # Pass 1: build the wall grid.
    grid = np.zeros((height, width), dtype=np.uint8)
    for row in range(height):
        for col in range(width):
            if list(arr[row][col]) == [0, 0, 0, 255]:
                grid[row][col] = 1

    # Pass 2: create agents at their start pixels.
    agents = AgentPool()
    for row in range(height):
        for col in range(width):
            if arr[row][col][2] == 16:
                name = Maze.name_from_pixel(arr[row][col])
                agents.add(Agent(name))
                agents.get(name).set_start((col, row))

    # Pass 3: attach waypoints and goals to the existing agents.
    for row in range(height):
        for col in range(width):
            blue = arr[row][col][2]
            if blue == 80:
                name = Maze.name_from_pixel(arr[row][col])
                agents.get(name).add_waypoint((col, row))
            if blue == 128:
                name = Maze.name_from_pixel(arr[row][col])
                agents.get(name).goal = (col, row)

    return Maze(grid, agents)
def main():
    """Parse CLI arguments, build the environments and agent, run training
    with the chosen update rule, and persist the returns (and optionally
    the model weights).
    """
    print ("note: 'ulimit -Sn 1024' if Errno 24")
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='CartPole-v1')
    parser.add_argument('--seed', type=int, default=417)
    # FIX: defaults were floats (1e5 / 1e4) on int-typed options; argparse
    # applies `type` only to command-line strings, so the defaults stayed
    # floats. Use real ints so args.n_timesteps is always an int.
    parser.add_argument('--n-timesteps', type=int, default=100000)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--max-kl', type=float, default=1e-2)
    parser.add_argument('--log-interval', type=int, default=10000)
    parser.add_argument('--save-path', default=None)
    parser.add_argument('--batch-size', type=int, default=1)
    # NOTE(review): type=bool is an argparse pitfall — bool("False") is
    # True. Kept for interface compatibility; consider action='store_true'.
    parser.add_argument('--cuda', type=bool, default=False)
    parser.add_argument('--update-rule', default='A2C')
    args = parser.parse_args()

    if args.cuda:
        assert torch.cuda.is_available(), 'No available cuda devices'

    envs = [gym.make(args.env) for _ in range(args.batch_size)]
    set_seeds(envs, args.seed, args.cuda)
    agent = Agent(envs[0].observation_space, envs[0].action_space)
    if args.cuda:
        agent.cuda()

    rets = learn(agent, envs, args.update_rule, cuda=args.cuda,
                 n_timesteps=args.n_timesteps, gamma=args.gamma,
                 log_interval=args.log_interval, max_kl=args.max_kl)

    torch.save(rets, "./out/{}_{}".format(args.env, args.update_rule))
    if not (args.save_path is None):
        torch.save(agent.state_dict(), args.save_path)
def init_agent(state_size, action_size, num_agents):
    """Create the module-level `agent` with the given dimensions."""
    global agent
    print("\nInitializing agent....")
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  num_agents=num_agents,
                  random_seed=RANDOM_SEED)
def test_train():
    """A short training run must change the agent's Q table."""
    # Size the run so the test stays quick.
    train_num_episodes = APPROX_EPISODES_PER_SECOND * DESIRED_TRAIN_NUM_SECONDS
    env = Environment()
    agent = Agent(env)
    agent.train(env, num_episodes=train_num_episodes,
                plot_training_rewards=False)
    # A freshly instantiated agent should have different Q values.
    fresh_env = Environment()
    fresh_agent = Agent(fresh_env)
    assert not np.array_equal(fresh_agent._Q, agent._Q)
def main():
    """LUBAN runner entry point: parse model args and launch training."""
    parser = argparse.ArgumentParser(description='LUBAN runner')
    register_model_args(parser)
    params, unparsed = parser.parse_known_args(sys.argv)
    # FIX: the tf.Session was never closed (resource leak); a context
    # manager releases it even if training raises.
    with tf.Session() as sess:
        agent = Agent(sess, params)
        agent.train(checkpoint_dir="./checkpoint",
                    data_dir='./data/dataset-50-3-2.hdf5')
def main(_):
    """Distributed DQN entry point (tf.train.Supervisor based).

    Runs either a parameter server (blocks in server.join()) or a worker.
    Workers build the agent under a replica device setter, then train or
    play depending on FLAGS.is_train; only the chief worker (task 0)
    writes summaries.
    """
    config = get_config(FLAGS) or FLAGS
    # NOTE(review): NHWC is forced here — presumably for CPU execution in
    # the distributed setting; confirm against GymEnvironment's producers.
    config.cnn_format = 'NHWC'
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
    if FLAGS.job_name == "ps":
        # Parameter servers only host variables; block forever.
        server.join()
    elif FLAGS.job_name == "worker":
        env = GymEnvironment(config)
        # Pin variables to the PS tasks and ops to this worker.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % FLAGS.task_index,
                    cluster=cluster)):
            lr_op = tf.placeholder('float', None, name='learning_rate')
            optimizer = tf.train.RMSPropOptimizer(lr_op, decay=0.99, momentum=0, epsilon=0.1)
            agent = Agent(config, env, optimizer, lr_op)
            # Each worker draws a different exploration endpoint at random.
            agent.ep_end = random.sample([0.1, 0.01, 0.5], 1)[0]
            print(agent.model_dir)
            # Create a "supervisor", which oversees the training process.
            is_chief = (FLAGS.task_index == 0)
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir="./logs/" + agent.model_dir,
                                     init_op=agent.init_op,
                                     summary_op=None,
                                     saver=agent.saver,
                                     global_step=agent.step_op,
                                     save_model_secs=600)
            if FLAGS.is_train:
                # Only the chief writes training summaries.
                if is_chief:
                    train_or_play = agent.train_with_summary
                else:
                    train_or_play = agent.train
            else:
                train_or_play = agent.play
            with sv.managed_session(server.target) as sess:
                agent.sess = sess
                agent.update_target_q_network()
                train_or_play(sv, is_chief)
            # Ask for all the services to stop.
            sv.stop()
def __init__(self):
    """Build the world, populate it with agents, and attach statistics.

    With fixed_sick_cases, exactly fixed_cases_count agents start sick;
    otherwise each agent starts sick with create_sick_agent_probability.
    """
    self.world = World(*SimulationConfig.word_size)
    self.graphic = Graphic(self.world, *SimulationConfig.pane_size)
    if SimulationConfig.fixed_sick_cases:
        # Deterministic count: the first fixed_cases_count agents are sick.
        for i in range(SimulationConfig.population_size):
            is_sick = i < SimulationConfig.fixed_cases_count
            self.world.add_agent_on_free(Agent(self.world, is_sick))
    else:
        # Probabilistic: each agent rolls independently.
        for _ in range(SimulationConfig.population_size):
            is_sick = get_it_with_probability(
                SimulationConfig.create_sick_agent_probability, True, False)
            self.world.add_agent_on_free(Agent(self.world, is_sick))
    self.statistic = Statistic(self.world)
def initialize_agents_from_files(self, agent_directory):
    """Two-pass agent initialization from the XML files in agent_directory.

    Pass 1 registers each agent's graph nodes on a fresh transition graph;
    pass 2 builds the agents, loads their parameters, and stores them.
    """
    from src.agent import Agent
    agent_files = os.listdir(agent_directory)
    self.transition_probabilities = nx.DiGraph()
    xml_files = [f for f in agent_files if '.xml' in f]
    for each_file in xml_files:
        Agent().get_nodes_for_graph(agent_directory + each_file, self)
    for each_file in xml_files:
        agent = Agent()
        agent.get_parameters_from_file(agent_directory + each_file, self)
        self.agents.append(agent)
def read_agent(version):
    """Load saved network weights for `version` and wrap them in an Agent."""
    nn = Residual_CNN(config['REG_CONST'], config['LEARNING_RATE'], (2, 4, 8),
                      config['ACTION_SIZE'], config['HIDDEN_CNN_LAYERS'],
                      config['MOMENTUM'])
    # Copy the persisted weights into a freshly built network.
    saved_model = nn.read(version)
    nn.model.set_weights(saved_model.get_weights())
    return Agent(nn, ActionEncoder(DirectionResolver()), StateEncoder(),
                 name='player' + str(version), config=config)
def test_produciton(config_u_0, config_w_0):
    """Production tests with zeroed phoneme durations.

    1. With u_duration set to 0, "kawuta" should surface without the u
       state (perceived as "kaQta"), while w keeps its full duration.
    2. With w_duration set to 0, the w state never appears, while u keeps
       its full duration.
    """
    src_phoneme = "kawuta"
    u_idx, w_idx = 1, 3
    u_duration, w_duration = 9, 3

    # u_duration == 0: the u state index must be absent.
    agent_u0 = Agent(config_u_0)
    _, _, states_u0 = agent_u0.production(src_phoneme)
    # (An absence check works for u here; it would not for "a", which
    # occurs more than once in the source phoneme.)
    assert u_idx not in states_u0
    assert len([s for s in states_u0 if s == w_idx]) == w_duration

    # w_duration == 0: the w state index must be absent.
    agent_w0 = Agent(config_w_0)
    _, _, states_w0 = agent_w0.production(src_phoneme)
    assert len([s for s in states_w0 if s == u_idx]) == u_duration
    assert w_idx not in states_w0
def learn_policy(self, epochs, actor_iterations=1, critic_iterations=1, episodes_per_update=1, epsilon_bound=0.2) \
        -> Tuple[Policy, List[float]]:
    """Run clipped policy-gradient training for `epochs` updates.

    Each update collects `episodes_per_update` episodes, computes the
    discounted reward-to-go, optionally subtracts the critic's value
    estimate as a baseline, and then updates the actor (and critic).

    Returns the learned policy and the per-episode reward history.
    """
    policy = StochasticPolicy(self.actor)
    agent = Agent(self.environment, policy)

    # Observers record episode rewards and full trajectories.
    r_obs = RewardObserver()
    t_obs = TrajectoryObserver()
    agent.attach_observer(t_obs)
    agent.attach_observer(r_obs)

    for _ in tqdm(range(epochs)):
        # TODO COLLECTING EPISODES CAN BE DONE IN PARALLEL WITH MULTIPLE AGENTS
        for _ in range(episodes_per_update):
            agent.perform_episode()

        # Flatten the per-episode lists into single tensors.
        reward_to_go = torch.tensor(
            concatenate(t_obs.reward_to_go(self.discount_factor)))
        trajectories = list_of_tuples_to_tuple_of_tensors(
            concatenate(t_obs.sampled_trajectories))

        if self.use_critic:
            state_index = 0
            # Advantage = reward-to-go minus the critic's value baseline;
            # detached so actor gradients do not flow into the critic.
            values = torch.squeeze(self.critic(trajectories[state_index]), 1)
            advantage = (reward_to_go - values).detach()
            self.update_actor(trajectories, advantage, actor_iterations,
                              epsilon_bound)
            self.update_critic(trajectories, reward_to_go, critic_iterations)
        else:
            self.update_actor(trajectories, reward_to_go, actor_iterations,
                              epsilon_bound)

        # Reset episode memory for the next update.
        t_obs.clear()

    return policy, r_obs.get_rewards()
def generate_data(mode, num_simulations=30):
    """
    Generate the dual model data from the TEST_GRID_LIST specified above.

    Args:
    - mode (Str): "delay" or "pressure"; whether the data generated has
      more or fewer monte carlo iterations to solve the test grids
    - num_simulations (int): how many data points to generate from the model

    Returns:
    - list of result dicts, one per (simulation, grid) pair, e.g.
      {'model': 'pressure', 'grid_num': 23, 'reward': 3,
       'best_reward': 3, 'id': 10}
    """
    agent = Agent()
    start = time.time()
    print("Starting {mode} data generation".format(mode=mode))
    model_results = []

    for i in range(num_simulations):
        # "pressure" = few MC iterations (time-constrained scenario);
        # "delay" = many MC iterations. Ranges were chosen by inspecting
        # the dual model performance graph in dual_model_data_generation.ipynb.
        if mode == "pressure":
            n_iters = random.randrange(0, 50)
        elif mode == "delay":
            n_iters = random.randrange(120, 530)

        for ind, grid_init in TEST_GRID_LIST:
            testgrid = grid.Grid(5, random=False, init_pos=grid_init)
            Q, policy = agent.mc_first_visit_control(testgrid.copy(),
                                                     iters=n_iters,
                                                     nn_init=True,
                                                     cutoff=0.4)
            _, _, model_reward = agent.run_final_policy(testgrid.copy(), Q,
                                                        nn_init=True,
                                                        display=False)
            # One result row for this particular model instantiation.
            model_results.append({
                'id': i,
                'model': mode,
                'grid_num': ind,
                'reward': model_reward,
                'best_reward': grid_init['best_reward'],
            })
        print("Simulation {num} took {time} seconds".format(
            num=i, time=time.time() - start))
        start = time.time()
    return model_results
def learn_policy(self, epochs=200, episodes_per_update=1):
    """Train the actor with per-episode policy-gradient updates.

    For each episode: compute the discounted reward-to-go, form the
    advantage, and accumulate -log_prob * advantage gradients (scaled by
    1/episodes_per_update). The actor optimizer steps once per epoch, and
    the advantage estimator is refreshed afterwards.

    Returns the learned policy and the per-episode reward history.
    """
    self.v_optimizer.zero_grad()
    state_index = 0
    action_index = 1
    policy = StochasticPolicy(self.a_distribution_model)
    agent = Agent(self.environment, policy)

    # Observers that record trajectories and episode rewards.
    t_obs = TrajectoryObserver()
    r_obs = RewardObserver()
    agent.attach_observer(t_obs)
    agent.attach_observer(r_obs)

    for _ in tqdm(range(epochs)):
        for _ in range(episodes_per_update):
            # Perform a complete episode with the observers attached.
            agent.perform_episode()

            # Collect the trajectory and its discounted reward-to-go.
            trajectory = t_obs.last_trajectory()
            reward_to_go = get_reward_to_go(trajectory, self.discount_factor)

            # Convert to pytorch tensors.
            trajectory = list_of_tuples_to_tuple_of_tensors(trajectory)
            reward_to_go = torch.tensor(reward_to_go, dtype=torch.float32)

            advantage = self.get_advantage(trajectory, reward_to_go)

            # REINFORCE loss: -log pi(a|s) * advantage, summed over steps.
            dist = self.a_distribution_model(trajectory[state_index])
            step_losses = -dist.log_prob(trajectory[action_index]) * advantage
            # Dividing by episodes_per_update makes the accumulated sum
            # estimate the expected gradient over that many episodes.
            policy_loss = torch.sum(step_losses) / episodes_per_update
            policy_loss.backward()

        # Gradient step after the batch of episodes.
        self.a_optimizer.step()
        self.a_optimizer.zero_grad()

        self.update_advantage()
        t_obs.clear()

    return policy, r_obs.get_rewards()
def initialize_agents_from_files(self, agent_directory):
    """Build one Agent per XML file found in agent_directory.

    Each agent loads its parameters from its file, is appended to
    self.agents, and then initializes its total assets.
    """
    from src.agent import Agent
    for filename in os.listdir(agent_directory):
        if '.xml' not in filename:
            continue
        agent = Agent()
        agent.get_parameters_from_file(agent_directory + filename, self)
        self.agents.append(agent)
        agent.initialize_total_assets()
def main():
    """Train a DQN trading agent on ^GSPC 2011 data, saving model
    checkpoints every 10 episodes and reporting total training time."""
    window_size = 5
    episode_count = 10
    stock_name = "^GSPC_2011"
    batch_size = 32

    agent = Agent(window_size)
    market = Market(window_size=window_size, stock_name=stock_name)

    start_time = time.time()
    for e in range(episode_count + 1):
        print("Episodio" + str(e) + "/" + str(episode_count))
        agent.reset()
        # ToDo: get the initial state
        state, price_data = market.reset()

        for t in range(market.last_data_index):
            # Ask the agent for its action given the current state.
            action, bought_price = agent.act(state, price_data)

            # Advance the market to obtain the next state and reward.
            next_state, next_price_data, reward, done = \
                market.get_next_state_reward(action, bought_price)

            # Store the transition; learn once enough memory accumulates.
            agent.memory.append((state, action, reward, next_state, done))
            if len(agent.memory) > batch_size:
                agent.experience_replay(batch_size)

            state = next_state
            price_data = next_price_data

            if done:
                print("--------------------------------")
                print("Ganancias totales: {0}".format(
                    agent.get_total_profit()))
                print("--------------------------------")

        # Checkpoint the model every 10 episodes.
        if e % 10 == 0:
            if not os.path.exists("models"):
                os.mkdir("models")
            agent.model.save("models/model_rl" + str(e))

    end_time = time.time()
    training_time = round(end_time - start_time)
    print("Entrenamiento tomó {0} segundos.".format(training_time))
def cartpoloe():
    """Train a DQN on CartPole-v1 (or load a saved Q-network) and render
    one episode with the resulting greedy policy."""
    training = True
    if training:
        environment = gym.make("CartPole-v1")
        q_model = nn.Sequential(nn.Linear(4, 24), nn.ReLU(),
                                nn.Linear(24, 24), nn.ReLU(),
                                nn.Linear(24, 2))
        optimizer = torch.optim.Adam(q_model.parameters(), lr=0.001)
        exploration = DecayingEpsilonGreedyQPolicy(q_model,
                                                   initial_epsilon=1.0,
                                                   decay_factor=0.95,
                                                   min_epsilon=0.05)
        learner = DQN(environment=environment, q_model=q_model,
                      optimizer=optimizer, exploration_policy=exploration)
        opt_policy, history = learner.learn_policy(episodes=200)

        # Save the reward curve for inspection.
        plt.plot(history)
        plt.xlabel('episode')
        plt.ylabel('total reward')
        plt.savefig("score.png")
        # torch.save(q_model.module, "learned networks/cartpole/q_network.torch")

        agent = Agent(environment=environment, policy=opt_policy)
        input("add anything to continue")
        agent.perform_episode(render=True)
    else:
        environment = gym.make("CartPole-v1")
        q_model = torch.load("learned networks/cartpole/q_network.torch")
        opt_policy = GreedyQPolicy(q_model)
        agent = Agent(environment=environment, policy=opt_policy)
        agent.perform_episode(render=True)
def graph_dual_model_performance():
    """Plot dual-model performance (normalized score vs number of MC
    iterations) for each test-grid category.
    """
    test_grids = TEST_GRIDS
    all_test_list = [(key, grid) for key, grid in test_grids.items()]
    # BUG FIX: sorted() returns a new list and its result was previously
    # discarded, leaving the grids unsorted. Sort in place instead.
    all_test_list.sort(key=lambda x: x[0])
    agent = Agent()
    iters = ITERS
    total_normal_grid_score, total_grid1_score, total_grid2_score, \
        total_grid3_score, total_grid4_score = [], [], [], [], []
    repeats = REPEATS
    for n in iters:
        print("Running iteration {n}".format(n=n))
        normal_grid_score, grid1_score, grid2_score, grid3_score, \
            grid4_score = [], [], [], [], []
        for ind, grid_init in all_test_list:
            normalized_score = 0
            for j in range(repeats):
                grid_num = int(ind)  # ind initially is a string
                best_reward = grid_init['best_reward']
                testgrid = Grid(5, random=False, init_pos=grid_init)
                Q, policy = agent.mc_first_visit_control(testgrid.copy(),
                                                         iters=n,
                                                         nn_init=True)
                _, _, dual_model_reward = agent.run_final_policy(
                    testgrid.copy(), Q, nn_init=True, display=False)
                normalized_score += dual_model_reward - best_reward
            # Bucket the average normalized score by grid category.
            if grid_num < 100:
                normal_grid_score.append(normalized_score / repeats)
            elif grid_num < 200:  # grid type 1
                grid1_score.append(normalized_score / repeats)
            elif grid_num < 300:  # grid type 2
                grid2_score.append(normalized_score / repeats)
            elif grid_num < 400:  # grid type 3
                grid3_score.append(normalized_score / repeats)
            else:  # grid type 4
                grid4_score.append(normalized_score / repeats)
        total_normal_grid_score.append(np.mean(normal_grid_score))
        total_grid1_score.append(np.mean(grid1_score))
        total_grid2_score.append(np.mean(grid2_score))
        total_grid3_score.append(np.mean(grid3_score))
        total_grid4_score.append(np.mean(grid4_score))
    # plt.plot(iters, total_normal_grid_score, label="normal grids", color="red")
    plt.plot(iters, total_grid1_score, label='push dilemma', color="blue")
    plt.plot(iters, total_grid2_score, label='switch dilemma', color="green")
    plt.plot(iters, total_grid3_score, label='switch save', color="orange")
    plt.plot(iters, total_grid4_score, label='push get', color="brown")
    plt.legend()
    plt.xlabel("Number of MC Iterations")
    plt.ylabel("Normalized \nScore")
    plt.title("Dual model performance on all test grids")
    plt.show()
def initialize_agents_from_files(self, agent_directory, network_config):
    """Load the agent network from network_config (GEXF) and build one
    Agent per XML file found in agent_directory.
    """
    from src.agent import Agent
    self.network = nx.read_gexf(network_config)
    # print(self.network.edges(data=True))
    for filename in os.listdir(agent_directory):
        if '.xml' not in filename:
            continue
        agent = Agent()
        agent.get_parameters_from_file(agent_directory + filename, self)
        self.agents.append(agent)
def test_agent():
    """Smoke test: one training step on random data runs without error."""
    state_space_dim = 3
    action_space_dim = 4
    train = Train()
    agent = Agent(state_space_dim=state_space_dim,
                  action_space_dim=action_space_dim,
                  low_action=-1,
                  high_action=1,
                  load=False)
    # Batch of one random state/transition (leading axis added via [None]).
    state = np.random.rand(state_space_dim)[None]
    next_state = np.random.rand(state_space_dim)[None]
    action = agent.get_action(state)
    reward = np.array([1])
    done = np.array([0])
    Q_loss, policy_loss = train(agent, state, next_state, action, reward, done)
    assert True