Example #1
def corrmaze(c_len):
    grid = np.zeros((3,c_len+2))
    for x in range(1,c_len+1):
        grid[0][x] = 1
        grid[2][x] = 1
    agents = [Agent("[255,0]",(0,0),(0,0),(c_len+1,0)),Agent("[0,255]",(c_len+1,2),(c_len+1,2),(0,2))]
    m = Maze(grid,agents)
    return m
Example #2
def test_exercise(capsys):
    bond = Agent("James", "Bond")
    print(bond)

    ionic = Agent("Ionic", "Bond")
    print(ionic)

    out, err = capsys.readouterr()
    assert out == "My name is Bond, James Bond\nMy name is Bond, Ionic Bond\n"
Example #3
def test_load_parameters():
    env = Environment()

    # instantiate an agent, loading in Q values from models/test
    agent = Agent(env, load_from_directory=MODELS_DIR / "test", load_N=False)
    # models/test/Q.csv is all 0's apart from a single 1 at [2, 1]
    assert agent._Q[2, 1] == 1.0

    # check that loading works in a two-step process too
    # also check that N values are correctly loaded
    agent = Agent(env)
    agent.load_parameters(MODELS_DIR / "test", load_N=True)
    assert agent._Q[2, 1] == 1.0
    assert agent._N[2, 1] == 1  # models/test/N.csv follows same pattern
Example #4
def test_save_parameters():
    # create a temporary directory to save into. It gets deleted at the end of the test.
    with tempfile.TemporaryDirectory() as tmp:
        save_dir = pathlib.Path(tmp)
        env = Environment()

        agent = Agent(env)
        agent._Q[3, 1] = 1.0
        agent._N[3, 1] = 1
        agent.save_parameters(to_directory=save_dir)

        del agent
        new_agent = Agent(env, load_from_directory=save_dir)
        assert new_agent._Q[3, 1] == 1.0
        assert new_agent._N[3, 1] == 1
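
Examples #3 and #4 exercise a CSV round trip for the agent's Q and N tables. A minimal sketch of how such persistence could work, assuming _Q and _N are NumPy arrays stored as Q.csv and N.csv (hypothetical; the tested Agent may implement this differently):

import pathlib
import numpy as np

def save_parameters(agent, to_directory: pathlib.Path):
    # dump both tables as plain CSV files
    np.savetxt(to_directory / "Q.csv", agent._Q, delimiter=",")
    np.savetxt(to_directory / "N.csv", agent._N, delimiter=",")

def load_parameters(agent, from_directory: pathlib.Path, load_N=True):
    agent._Q = np.loadtxt(from_directory / "Q.csv", delimiter=",")
    if load_N:
        agent._N = np.loadtxt(from_directory / "N.csv", delimiter=",")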
Example #5
def run(load_path, model_path, index, load_model):

	with tf.device('/gpu:0'):
		trainer = tf.train.AdamOptimizer(learning_rate=1e-4)

		global_env = Environment(load_path=load_path, starting_index=index, final_index=index+1)
		global_net = A3C_Network()
		agent = Agent(0, global_net.n_inputs_policy, global_net.n_inputs_matching,
						global_net.n_actions_policy, trainer, load_path, model_path)

		saver = tf.train.Saver()

	with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
		
		coord = tf.train.Coordinator()

		if load_model:
			ckpt = tf.train.get_checkpoint_state(model_path)
			saver.restore(sess, ckpt.model_checkpoint_path)
			sess.run(global_env.index.assign(index))
			sess.run(global_env.final_index.assign(index+1))
		else:
			sess.run(tf.global_variables_initializer())

		agent_test = lambda: agent.test(sess, coord)
		t = threading.Thread(target=(agent_test))
		t.start()
		t.join()
Example #6
def main(test):
    # init the environment
    env = UnityEnvironment(file_name=REACHER_APP)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    # number of actions
    action_size = brain.vector_action_space_size
    # dimenison of the state space
    state_size = env_info.vector_observations.shape[1]
    # number of agents
    n_agents = len(env_info.agents)

    # create a DDPG agent
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  n_agents=n_agents,
                  random_seed=1)
    if not test:
        # train the agent
        scores = run_agent(env, agent, n_episodes=300)
        _ = plot_scores(agent, scores)
    else:
        # test the trained agent
        # load the weights from file
        agent.actor_local.load_state_dict(
            torch.load(f'weights/{str(agent)}_checkpoint_actor.pth'))
        agent.critic_local.load_state_dict(
            torch.load(f'weights/{str(agent)}_checkpoint_critic.pth'))
        test_agent(env, agent, n_agents)

    env.close()
Example #7
    def train(self, log_file_dir="./tensorboard", index="0"):

        self.__print_upperbound()
        self.__init_tensorboard(log_file_dir)
        
        starttime = time.time()
        total_data_time = 0
        total_training_time = 0
        for i in range(int(self.train_config['steps'])):
            step_start = time.time()
            X, w, y, setw = self.next_batch()
            finish_data = time.time()
            total_data_time += (finish_data - step_start)
            self._agent.train_step(X, w, y, setw=setw)
            total_training_time += time.time() - finish_data 
            if i % 1000 == 0 and log_file_dir:
                logging.info("average time for data accessing is %s"%(total_data_time/1000))
                logging.info("average time for training is %s"%(total_training_time/1000))
                total_training_time = 0
                total_data_time = 0
                self.log_between_steps(i)
            
        if self.save_path:
            best_agent = Agent(self.config, restore_dir=self.save_path)
            self._agent = best_agent

        pv_vector, loss, output = self._evaluate("test")
        pv = self._agent.portfolio_value
        log_mean = self._agent.log_mean
        logging.warning('the portfolio value train No.%s is %s log_mean is %s,'
                        ' the training time is %d seconds' % (index, pv, log_mean, time.time() - starttime))
Example #8
def main():
    """
    This function will be called for the training phase.
    """
    # Sample code for illustration; add your code below to run in the test phase.
    # Load the trained model from the train/ directory
    env = gym.make(MINERL_GYM_ENV)
    if FRAME_SKIP > 0:
        env = FrameSkip(env, enable_rendering=True)
    env = ObsWrapper(env)
    env = MoveAxisWrapper(env, -1, 0)
    env = CombineActionWrapper(env)

    agent = Agent(env.observation_space, env.action_space)
    agent.load_model()

    for _ in range(MINERL_MAX_EVALUATION_EPISODES):
        obs = env.reset()
        done = False
        netr = 0
        while not done:
            action = agent.act(obs)
            obs, reward, done, info = env.step(action)
            netr += reward
            env.render()

    env.close()
Example #9
def pl_grad():
    print("hi")
    environment = gym.make("CartPole-v1")
    net = nn.Sequential(nn.Linear(4, 40, bias=False), nn.ReLU(),
                        nn.Linear(40, 2, bias=False), nn.Softmax(dim=1))

    class distributionNet(nn.Module):
        def __init__(self):
            super(distributionNet, self).__init__()
            self.net = net

        def forward(self, x):
            return Categorical(self.net(x))

    a_model = distributionNet()
    optimizer = torch.optim.Adam(a_model.parameters(), lr=0.01)
    learner = PolicyGradient(environment,
                             a_model,
                             optimizer,
                             discount_factor=0.99)
    opt_policy, history = learner.learn_policy(epochs=500,
                                               episodes_per_update=1)

    plt.plot(history)
    plt.xlabel('episode')
    plt.ylabel('total reward')
    plt.savefig("score.png")

    agent = Agent(environment=environment, policy=opt_policy)
    input("add anything to continue")
    agent.perform_episode(render=True)
Example #10
    def __init__(self,
                 config,
                 agent=None,
                 save_path=None,
                 restore_dir=None,
                 device="cpu"):
        self.config = config
        self.train_config = config["training"]
        self.input_config = config["input"]
        self.best_metric = 0
        self.save_path = save_path

        self._matrix = DataMatrices.create_from_config(config)
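        # __global_data is name-mangled, hence the _DataMatrices prefix used below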
        self.time_index = self._matrix._DataMatrices__global_data.time_index.values
        self.coins = self._matrix._DataMatrices__global_data.coins.values
        self.test_set = self._matrix.get_test_set()
        self.training_set = self._matrix.get_training_set()

        tf.random.set_seed(self.config["random_seed"])
        self.device = device
        self._agent = Agent(config,
                            time_index=self.time_index,
                            coins=self.coins,
                            restore_dir=restore_dir)

        self.keras_test = self._matrix.keras_batch(data="test")
Example #11
def plot_grid_2_mc():
    test_grids = TEST_GRIDS
    all_test_list = [(key, grid) for key, grid in test_grids.items()]
    all_test_list.sort(key=lambda x: x[0])
    agent = Agent()
    iters = ITERS
    total_normal_grid_score, total_grid1_score, total_grid2_score, total_grid3_score, total_grid4_score = [],[],[],[],[]
    repeats = REPEATS
    # for n in iters:
    #   print("Running iteration {n}".format(n=n))
    grid2_score, grid4_score = [], []
    for ind, grid_init in all_test_list:
        normalized_score = 0
        for j in range(repeats):
            grid_num = int(ind)  #ind initially is a string.
            if (grid_num < 200) or (grid_num > 300):
                continue

            best_reward = grid_init['best_reward']
            testgrid = Grid(5, random=False, init_pos=grid_init)
            if grid_num in {204, 208}:
                Q, policy = agent.mc_first_visit_control(testgrid.copy(),
                                                         iters=500)
                _, _, mc_reward = agent.run_final_policy(testgrid.copy(),
                                                         Q,
                                                         display=True)
            else:
                continue
            normalized_score += mc_reward - best_reward
            if normalized_score != 0:
                print(
                    "Grid num {0} did not achieve best score".format(grid_num))
Example #12
    def from_image(file):
        img = Image.open(file)
        arr = np.array(img)
        height, width, _ = arr.shape
        grid = np.zeros((height, width), dtype=np.uint8)
        for y in range(height):
            for x in range(width):
                if list(arr[y][x]) == [0, 0, 0, 255]:
                    grid[y][x] = 1

        agents = AgentPool()
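        # The blue channel of each RGBA pixel encodes the cell type:
        # 16 marks an agent's start, 80 a waypoint, 128 a goal.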
        for y in range(height):
            for x in range(width):
                if arr[y][x][2] == 16:
                    name = Maze.name_from_pixel(arr[y][x])
                    agents.add(Agent(name))
                    agents.get(name).set_start((x, y))

        for y in range(height):
            for x in range(width):
                if arr[y][x][2] == 80:
                    name = Maze.name_from_pixel(arr[y][x])
                    agents.get(name).add_waypoint((x, y))
                if arr[y][x][2] == 128:
                    name = Maze.name_from_pixel(arr[y][x])
                    agents.get(name).goal = (x, y)

        return Maze(grid, agents)
Example #13
def main():
    print ("note: 'ulimit -Sn 1024' if Errno 24")
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='CartPole-v1')
    parser.add_argument('--seed', type=int, default=417)
    parser.add_argument('--n-timesteps', type=int, default=1e5)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--max-kl', type=float, default=1e-2)
    parser.add_argument('--log-interval', type=int, default=1e4)
    parser.add_argument('--save-path', default=None)
    parser.add_argument('--batch-size', type=int, default=1)
    parser.add_argument('--cuda', type=bool, default=False)
    parser.add_argument('--update-rule', default='A2C')
    args = parser.parse_args()

    if args.cuda:
        assert torch.cuda.is_available(), 'No available cuda devices'

    envs = [gym.make(args.env) for _ in range(args.batch_size)]
    set_seeds(envs, args.seed, args.cuda)

    agent = Agent(envs[0].observation_space, envs[0].action_space)
    if args.cuda:
        agent.cuda()

    rets = learn(agent, envs, args.update_rule, cuda=args.cuda, n_timesteps=args.n_timesteps, gamma=args.gamma,
          log_interval=args.log_interval, max_kl=args.max_kl)

    torch.save(rets, "./out/{}_{}".format(args.env, args.update_rule))

    if args.save_path is not None:
        torch.save(agent.state_dict(), args.save_path)
Example #14
def init_agent(state_size, action_size, num_agents):
    global agent

    print("\nInitializing agent....")
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  num_agents=num_agents,
                  random_seed=RANDOM_SEED)
Example #15
def test_train():
    train_num_episodes = APPROX_EPISODES_PER_SECOND * DESIRED_TRAIN_NUM_SECONDS

    # run a quick session of training and make plots.
    env = Environment()
    agent = Agent(env)
    agent.train(env,
                num_episodes=train_num_episodes,
                plot_training_rewards=False)

    # assert the trained agent has different Q values to a freshly instantiated one.
    fresh_env = Environment()
    fresh_agent = Agent(fresh_env)
    assert not np.array_equal(
        fresh_agent._Q,
        agent._Q
    )
Example #16
def main():
    parser = argparse.ArgumentParser(description='LUBAN runner')
    register_model_args(parser)
    params, unparsed = parser.parse_known_args(sys.argv)
    sess = tf.Session()
    agent = Agent(sess, params)
    agent.train(checkpoint_dir="./checkpoint",
                data_dir='./data/dataset-50-3-2.hdf5')
Example #17
def main(_):
    config = get_config(FLAGS) or FLAGS
    config.cnn_format = 'NHWC'

    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")

    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        env = GymEnvironment(config)

        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % FLAGS.task_index,
                    cluster=cluster)):
            lr_op = tf.placeholder('float', None, name='learning_rate')
            optimizer = tf.train.RMSPropOptimizer(lr_op,
                                                  decay=0.99,
                                                  momentum=0,
                                                  epsilon=0.1)
            agent = Agent(config, env, optimizer, lr_op)

            agent.ep_end = random.sample([0.1, 0.01, 0.5], 1)[0]

        print(agent.model_dir)

        # Create a "supervisor", which oversees the training process.
        is_chief = (FLAGS.task_index == 0)
        sv = tf.train.Supervisor(is_chief=is_chief,
                                 logdir="./logs/" + agent.model_dir,
                                 init_op=agent.init_op,
                                 summary_op=None,
                                 saver=agent.saver,
                                 global_step=agent.step_op,
                                 save_model_secs=600)

        if FLAGS.is_train:
            if is_chief:
                train_or_play = agent.train_with_summary
            else:
                train_or_play = agent.train
        else:
            train_or_play = agent.play

        with sv.managed_session(server.target) as sess:
            agent.sess = sess
            agent.update_target_q_network()

            train_or_play(sv, is_chief)

    # Ask for all the services to stop.
    sv.stop()
Example #18
    def __init__(self):
        self.world = World(*SimulationConfig.word_size)
        self.graphic = Graphic(self.world, *SimulationConfig.pane_size)

        if SimulationConfig.fixed_sick_cases:
            for i in range(SimulationConfig.population_size):
                if i < SimulationConfig.fixed_cases_count:
                    self.world.add_agent_on_free(Agent(self.world, True))
                else:
                    self.world.add_agent_on_free(Agent(self.world, False))
        else:
            for i in range(SimulationConfig.population_size):
                self.world.add_agent_on_free(
                    Agent(
                        self.world,
                        get_it_with_probability(
                            SimulationConfig.create_sick_agent_probability,
                            True, False)))
        self.statistic = Statistic(self.world)
Example #19
    def initialize_agents_from_files(self, agent_directory):

        from src.agent import Agent
        agent_files = os.listdir(agent_directory)

        self.transition_probabilities = nx.DiGraph()
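        # two passes over the agent files: first register every agent's nodes
        # in the transition graph, then load parameters and store the agents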

        for each_agent_file in agent_files:

            if '.xml' in each_agent_file:
                agent = Agent()
                agent_filename = agent_directory + each_agent_file
                agent.get_nodes_for_graph(agent_filename, self)

        for each_agent_file in agent_files:

            if '.xml' in each_agent_file:
                agent = Agent()
                agent_filename = agent_directory + each_agent_file
                agent.get_parameters_from_file(agent_filename, self)
                self.agents.append(agent)
Example #20
File: gui.py Project: evgeniy44/mcts
def read_agent(version):
    nn = Residual_CNN(config['REG_CONST'], config['LEARNING_RATE'], (2, 4, 8),
                      config['ACTION_SIZE'], config['HIDDEN_CNN_LAYERS'],
                      config['MOMENTUM'])
    m_tmp = nn.read(version)
    nn.model.set_weights(m_tmp.get_weights())
    player = Agent(nn,
                   ActionEncoder(DirectionResolver()),
                   StateEncoder(),
                   name='player' + str(version),
                   config=config)
    return player
Example #21
def test_production(config_u_0, config_w_0):
    """
    1. Test that the input is perceived as kaQta when u_duration is set to 0.
    2. The corresponding check with w_duration set to 0 (the w_0 config).
    """
    src_phoneme = "kawuta"
    u_idx, w_idx = 1, 3
    u_duration, w_duration = 9, 3
    # u_0
    a_u_0 = Agent(config_u_0)
    _, _, state_idxs_u_0 = a_u_0.production(src_phoneme)
    assert u_idx not in state_idxs_u_0
    # this check would not hold for 'a', since it occurs more than once in the input
    assert len(list(filter(lambda e: e == w_idx,
                           state_idxs_u_0))) == w_duration
    # w_0
    a_w_0 = Agent(config_w_0)
    _, _, state_idxs_w_0 = a_w_0.production(src_phoneme)
    assert len(list(filter(lambda e: e == u_idx,
                           state_idxs_w_0))) == u_duration
    assert w_idx not in state_idxs_w_0
Example #22
    def learn_policy(self,
                     epochs,
                     actor_iterations=1,
                     critic_iterations=1,
                     episodes_per_update=1,
                     epsilon_bound=0.2) \
            -> Tuple[Policy, List[float]]:

        policy = StochasticPolicy(self.actor)

        agent = Agent(self.environment, policy)
        r_obs = RewardObserver()
        t_obs = TrajectoryObserver()
        agent.attach_observer(t_obs)
        agent.attach_observer(r_obs)

        for _ in tqdm(range(epochs)):
            # TODO COLLECTING EPISODES CAN BE DONE IN PARALLEL WITH MULTIPLE AGENTS
            for _ in range(episodes_per_update):
                agent.perform_episode()

            reward_to_go = t_obs.reward_to_go(self.discount_factor)
            trajectories = t_obs.sampled_trajectories

            # unify trajectories into single list
            reward_to_go = concatenate(reward_to_go)
            trajectories = concatenate(trajectories)

            # to tensor
            reward_to_go = torch.tensor(reward_to_go)
            trajectories = list_of_tuples_to_tuple_of_tensors(trajectories)

            if self.use_critic:
                state_index = 0

                v = self.critic(trajectories[state_index])
                v = torch.squeeze(v, 1)

                advantage = reward_to_go - v
                advantage = advantage.detach()

                self.update_actor(trajectories, advantage, actor_iterations,
                                  epsilon_bound)
                self.update_critic(trajectories, reward_to_go,
                                   critic_iterations)
            else:
                self.update_actor(trajectories, reward_to_go, actor_iterations,
                                  epsilon_bound)

            # reset memory for next iteration
            t_obs.clear()

        return policy, r_obs.get_rewards()
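
The reward_to_go values consumed above can be computed with one backward pass over each episode's rewards. A minimal sketch, assuming a flat list of per-step rewards (the project's actual TrajectoryObserver.reward_to_go may differ):

def reward_to_go(rewards, discount_factor):
    # rtg[t] = rewards[t] + discount_factor * rtg[t + 1]
    rtg = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount_factor * running
        rtg[t] = running
    return rtg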
Example #23
def generate_data(mode, num_simulations=30):
    """
    Generate the dual model data from the TEST_GRID_LIST specified above.

    Args:
        - mode (Str): "delay" or "pressure"; whether the data generated has more
            or fewer monte carlo iterations to solve the test grids
        - num_simulations (int): how many data points to generate from the model
    Returns:
        -
    """
    agent = Agent()
    start = time.time()
    print("Starting {mode} data generation".format(mode=mode))
    model_results = [
    ]  # item e.g. {'model':'constrained','grid_num':23,'reward':3,'best_reward':3,'id':10}
    # Generate dual model "time constrained scenario"
    for i in range(num_simulations):
        if mode == "pressure":
            n_iters = random.randrange(
                0, 50
            )  # choose a random number of MC iterations in [0, 50)
        elif mode == "delay":
            n_iters = random.randrange(
                120, 530
            )  #note these ranges were chosen by looking at the dual model performance graph
            # in the dual_model_data_generation.ipynb

        for ind, grid_init in TEST_GRID_LIST:
            testgrid = grid.Grid(5, random=False, init_pos=grid_init)
            Q, policy = agent.mc_first_visit_control(testgrid.copy(),
                                                     iters=n_iters,
                                                     nn_init=True,
                                                     cutoff=0.4)
            _, _, model_reward = agent.run_final_policy(testgrid.copy(),
                                                        Q,
                                                        nn_init=True,
                                                        display=False)
            individual_info = {
            }  #information for this particular model instantiation
            individual_info['id'] = i
            individual_info['model'] = mode
            individual_info['grid_num'] = ind
            individual_info['reward'] = model_reward
            individual_info['best_reward'] = grid_init['best_reward']
            model_results.append(individual_info)
        print("Simulation {num} took {time} seconds".format(num=i,
                                                            time=time.time() -
                                                            start))
        start = time.time()

    return model_results
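
A possible way to drive the generator and persist its output (illustrative only; the pandas dependency and file name are assumptions, and the surrounding project may provide its own runner):

import pandas as pd

results = generate_data("pressure", num_simulations=5)
pd.DataFrame(results).to_csv("dual_model_pressure_data.csv", index=False)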
Example #24
    def learn_policy(self, epochs=200, episodes_per_update=1):
        self.v_optimizer.zero_grad()
        state_index = 0
        action_index = 1

        policy = StochasticPolicy(self.a_distribution_model)
        agent = Agent(self.environment, policy)

        # utilities to collect agent data
        t_obs = TrajectoryObserver()
        r_obs = RewardObserver()
        agent.attach_observer(t_obs)
        agent.attach_observer(r_obs)

        for _ in tqdm(range(epochs)):
            for _ in range(episodes_per_update):
                # perform complete episode with observers attached
                agent.perform_episode()

                # collect trajectory and calculate reward to go
                trajectory = t_obs.last_trajectory()
                reward_to_go = get_reward_to_go(trajectory,
                                                self.discount_factor)

                # convert to pytorch tensors
                trajectory = list_of_tuples_to_tuple_of_tensors(trajectory)
                reward_to_go = torch.tensor(reward_to_go, dtype=torch.float32)

                advantage = self.get_advantage(trajectory, reward_to_go)

                # calculate loss
                policy_loss = self.a_distribution_model(
                    trajectory[state_index])
                policy_loss = -policy_loss.log_prob(
                    trajectory[action_index]) * advantage
                policy_loss = torch.sum(policy_loss)

                # to estimate the expected gradient of episodes_per_update episodes,
                # we divide the loss by episodes_per_update
                policy_loss = policy_loss / episodes_per_update

                # accumulate gradient
                policy_loss.backward()

            # gradient step
            self.a_optimizer.step()
            self.a_optimizer.zero_grad()
            self.update_advantage()

            t_obs.clear()

        return policy, r_obs.get_rewards()
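
The comment about dividing the loss by episodes_per_update relies on a standard gradient-accumulation identity: summing the gradients of loss_k / K over K backward passes equals the gradient of the mean loss. A small self-contained check of that identity (illustrative only, not part of the project):

import torch

def losses(w):
    # three toy losses sharing the same parameter
    return [(w * k).sum() for k in range(1, 4)]

# accumulate gradients of loss_k / K over K backward passes
w = torch.ones(3, requires_grad=True)
for loss in losses(w):
    (loss / 3).backward()
accumulated = w.grad.clone()

# single backward pass on the mean of the K losses
w2 = torch.ones(3, requires_grad=True)
torch.stack(losses(w2)).mean().backward()

assert torch.allclose(accumulated, w2.grad)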
Example #25
    def initialize_agents_from_files(self, agent_directory):

        from src.agent import Agent
        agent_files = os.listdir(agent_directory)

        for each_agent_file in agent_files:

            if '.xml' in each_agent_file:
                agent = Agent()
                agent_filename = agent_directory + each_agent_file
                agent.get_parameters_from_file(agent_filename, self)
                self.agents.append(agent)
                agent.initialize_total_assets()
Example #26
def main():

    window_size = 5
    episode_count = 10
    stock_name = "^GSPC_2011"

    agent = Agent(window_size)
    market = Market(window_size=window_size, stock_name=stock_name)

    batch_size = 32

    start_time = time.time()
    for e in range(episode_count + 1):
        print("Episodio" + str(e) + "/" + str(episode_count))
        agent.reset()
        state, price_data = market.reset()  # ToDo: get the initial state

        for t in range(market.last_data_index):
            # get the agent's current action
            # by calling the agent's act() method with the current state
            action, bought_price = agent.act(state, price_data)

            # get the agent's next state from the market
            next_state, next_price_data, reward, done =\
                market.get_next_state_reward(action, bought_price)

            # add the transaction to the agent's memory
            agent.memory.append((state, action, reward, next_state, done))
            # learn from past experience only once enough memory has accumulated
            if len(agent.memory) > batch_size:
                agent.experience_replay(batch_size)

            state = next_state
            price_data = next_price_data

            if done:
                print("--------------------------------")
                print("Ganancias totales: {0}".format(
                    agent.get_total_profit()))
                print("--------------------------------")

        if e % 10 == 0:
            if not os.path.exists("models"):
                os.mkdir("models")
            agent.model.save("models/model_rl" + str(e))

    end_time = time.time()
    training_time = round(end_time - start_time)
    print("Entrenamiento tomó {0} segundos.".format(training_time))
Example #27
def cartpole():
    training = True

    if training:
        environment = gym.make("CartPole-v1")
        q_model = nn.Sequential(nn.Linear(4, 24), nn.ReLU(), nn.Linear(24, 24),
                                nn.ReLU(), nn.Linear(24, 2))

        optimizer = torch.optim.Adam(q_model.parameters(), lr=0.001)

        learner = DQN(environment=environment,
                      q_model=q_model,
                      optimizer=optimizer,
                      exploration_policy=DecayingEpsilonGreedyQPolicy(
                          q_model,
                          initial_epsilon=1.0,
                          decay_factor=0.95,
                          min_epsilon=0.05))

        opt_policy, history = learner.learn_policy(episodes=200)
        plt.plot(history)
        plt.xlabel('episode')
        plt.ylabel('total reward')
        plt.savefig("score.png")
        # torch.save(q_model.module, "learned networks/cartpole/q_network.torch")

        agent = Agent(environment=environment, policy=opt_policy)
        input("add anything to continue")
        agent.perform_episode(render=True)

    else:
        environment = gym.make("CartPole-v1")
        q_model = torch.load("learned networks/cartpole/q_network.torch")
        opt_policy = GreedyQPolicy(q_model)
        agent = Agent(environment=environment, policy=opt_policy)
        agent.perform_episode(render=True)
Example #28
def graph_dual_model_performance():
    test_grids = TEST_GRIDS
    all_test_list = [(key, grid) for key, grid in test_grids.items()]
    all_test_list.sort(key=lambda x: x[0])
    agent = Agent()
    iters = ITERS
    total_normal_grid_score, total_grid1_score, total_grid2_score, total_grid3_score, total_grid4_score = [],[],[],[],[]
    repeats = REPEATS
    for n in iters:
        print("Running iteration {n}".format(n=n))
        normal_grid_score, grid1_score, grid2_score, grid3_score, grid4_score = [],[],[],[],[]
        for ind, grid_init in all_test_list:
            normalized_score = 0
            for j in range(repeats):
                grid_num = int(ind)  #ind initially is a string.
                best_reward = grid_init['best_reward']
                testgrid = Grid(5, random=False, init_pos=grid_init)
                Q, policy = agent.mc_first_visit_control(testgrid.copy(),
                                                         iters=n,
                                                         nn_init=True)
                _, _, dual_model_reward = agent.run_final_policy(
                    testgrid.copy(), Q, nn_init=True, display=False)
                normalized_score += dual_model_reward - best_reward
            if grid_num < 100:
                normal_grid_score.append(normalized_score / repeats)
            elif grid_num < 200:  #grid type 1
                grid1_score.append(normalized_score / repeats)
            elif grid_num < 300:  #grid type 2
                grid2_score.append(normalized_score / repeats)
            elif grid_num < 400:  #grid type 3
                grid3_score.append(normalized_score / repeats)
            else:  #grid type 4
                grid4_score.append(normalized_score / repeats)
        total_normal_grid_score.append(np.mean(normal_grid_score))
        total_grid1_score.append(np.mean(grid1_score))
        total_grid2_score.append(np.mean(grid2_score))
        total_grid3_score.append(np.mean(grid3_score))
        total_grid4_score.append(np.mean(grid4_score))
    # plt.plot(iters, total_normal_grid_score, label="normal grids", color="red")
    plt.plot(iters, total_grid1_score, label='push dilemma', color="blue")
    plt.plot(iters, total_grid2_score, label='switch dilemma', color="green")
    plt.plot(iters, total_grid3_score, label='switch save', color="orange")
    plt.plot(iters, total_grid4_score, label='push get', color="brown")
    plt.legend()
    plt.xlabel("Number of MC Iterations")
    plt.ylabel("Normalized Score")
    plt.title("Dual model performance on all test grids")
    plt.show()
Example #29
    def initialize_agents_from_files(self, agent_directory, network_config):

        from src.agent import Agent
        agent_files = os.listdir(agent_directory)

        self.network = nx.read_gexf(network_config)

        # print(self.network.edges(data=True))

        for each_agent_file in agent_files:

            if '.xml' in each_agent_file:
                agent = Agent()
                agent_filename = agent_directory + each_agent_file
                agent.get_parameters_from_file(agent_filename, self)
                self.agents.append(agent)
Example #30
def test_agent():
    state_space_dim = 3
    action_space_dim = 4
    train = Train()
    agent = Agent(state_space_dim=state_space_dim,
                  action_space_dim=action_space_dim,
                  low_action=-1,
                  high_action=1,
                  load=False)
    state = np.random.rand(state_space_dim)[None]
    next_state = np.random.rand(state_space_dim)[None]
    action = agent.get_action(state)
    reward = np.array([1])
    done = np.array([0])
    Q_loss, policy_loss = train(agent, state, next_state, action, reward, done)
    assert True  # the test passes as long as the training step above runs without raising