Example #1
def env_diff(env_a, env_b, iterations, step_size):
    all_states = env_a.get_all_states() + env_b.get_all_states()

    dqn_a = DQN(env_a,
                qnet=LinRegNet(64, 4).double(),
                plotter=None,
                render=False,
                memory_length=2000,
                gamma=.99,
                alpha=.001,
                epsilon_start=0.1)
    dqn_b = DQN(env_b,
                qnet=LinRegNet(64, 4).double(),
                plotter=None,
                render=False,
                memory_length=2000,
                gamma=.99,
                alpha=.001,
                epsilon_start=0.1)

    all_mean_diffs = []
    for ne in range(0, iterations):
        dqn_a.train(step_size, 4, plot=False)
        dqn_b.train(step_size, 4, plot=False)

        env_diffs = []

        # Weight each state by how often the two agents have visited it.
        total_visits = []
        for state in all_states:
            state = torch.from_numpy(state)
            total_visits.append(
                (dqn_a.state_visits[state] if state in dqn_a.state_visits else 0) +
                (dqn_b.state_visits[state] if state in dqn_b.state_visits else 0))

        # L2-normalize the visit counts; keep the weights in a list aligned
        # with all_states (tensors hash by identity, so they make poor dict keys).
        norm = np.linalg.norm(total_visits)
        normalized_visits = [v / norm for v in total_visits]

        # Score the visit weight whenever both greedy policies pick the same action.
        with torch.no_grad():
            for state, weight in zip(all_states, normalized_visits):
                state = torch.from_numpy(state)
                action_a = dqn_a.qnet(state).max(0)[1].item()
                action_b = dqn_b.qnet(state).max(0)[1].item()
                env_diffs.append(weight if action_a == action_b else 0)

        print('{}/{} mean difference: {:.4f}'.format(ne + 1, iterations,
                                                     np.mean(env_diffs)))
        all_mean_diffs.append(np.mean(env_diffs))

    absolutely_all_diffs.append(all_mean_diffs)
    return all_mean_diffs[-1]
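
A minimal usage sketch, assuming the Gridworld constructor from the commented-out driver in Example #2; absolutely_all_diffs is the module-level list that env_diff appends to.

env_a = Gridworld(width=4, height=4, cell_size=32, agent_pos=(2, 0),
                  food_pos=[(0, 3), (3, 3)])
env_b = Gridworld(width=4, height=4, cell_size=32, agent_pos=(2, 0),
                  food_pos=[(1, 3), (3, 3)])
absolutely_all_diffs = []
print(env_diff(env_a, env_b, iterations=10, step_size=10))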
Example #2
def env_diff(env_a, env_b, iterations, step_size):
    all_states = env_a.get_all_states() + env_b.get_all_states()

    dqn_a = DQN(env_a,
                qnet=LinRegNet(64, 4).double(),
                plotter=None,
                render=False,
                memory_length=2000,
                gamma=.99,
                alpha=.001,
                epsilon_start=0.1)
    dqn_b = DQN(env_b,
                qnet=LinRegNet(64, 4).double(),
                plotter=None,
                render=False,
                memory_length=2000,
                gamma=.99,
                alpha=.001,
                epsilon_start=0.1)

    all_mean_diffs = []
    for ne in range(0, iterations):
        dqn_a.train(step_size, 4, plot=False)
        dqn_b.train(step_size, 4, plot=False)

        env_diffs = []

        with torch.no_grad():
            for state in all_states:
                state = torch.from_numpy(state)
                env_diffs.append(
                    torch.sum((dqn_a.qnet(state) - dqn_b.qnet(state))**2).item())

        print('{}/{} mean difference: {:.4f}'.format(ne + 1, iterations,
                                                     np.mean(env_diffs)))
        all_mean_diffs.append(np.mean(env_diffs))

    absolutely_all_diffs.append(all_mean_diffs)
    return all_mean_diffs[-1]


#envs = [
#    (Gridworld(width=4, height=4, cell_size=32, agent_pos=(2, 0), food_pos=[(0, 3), (3, 3)]),
#     Gridworld(width=4, height=4, cell_size=32, agent_pos=(2, 0), food_pos=[(1, 3), (3, 3)]))
#]

#for env_pair in envs:
#    print(env_diff(env_pair[0], env_pair[1], 10, 10))

#for diff in absolutely_all_diffs:
#    plt.plot(diff)

#plt.savefig('test.png')
#plt.show()
Example #3
def dqn_benchmark(env, iterations):
    histories = []
    for _ in range(iterations):
        dqn = DQN(env,
                  qnet=LinRegNet(64, 4).double(),
                  plot_durations=True,
                  plotter=None,
                  render=False,
                  memory_length=2000,
                  gamma=.99,
                  alpha=.001,
                  epsilon_start=0.1)
        dqn.train(200, 4)
        histories.append(np.mean(dqn.history))
    return histories
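
A minimal usage sketch, assuming the Gridworld constructor used in the other examples; the seed and iteration count are illustrative.

env = Gridworld(width=4, height=4, cell_size=32, seed=20)
histories = dqn_benchmark(env, iterations=5)
print('mean episode history per run:', histories)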
Example #4
import gym
from vicero.algorithms.deepqlearning import DQN, NetworkSpecification
from vicero.agent import Agent

# This example shows off multiple concepts; for a purer DQN example,
# see mountaincar. The first part is, of course, to solve CartPole.
# In addition, the script saves the policy as it is after one shorter
# round of training, then trains a while longer, and finally shows
# both policies in comparison. This demonstrates both that training
# actually improves performance and the concept of saving a policy.

env = gym.make('CartPole-v1')

spec = NetworkSpecification()
dqn = DQN(env, spec, render=False, caching_interval=200)

batch_size = 32
num_episodes = 4
training_iter = 500
completion_reward = -10

print('training...')
dqn.train(num_episodes,
          batch_size,
          training_iter,
          verbose=True,
          completion_reward=completion_reward,
          plot=True,
          eps_decay=True)
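
A hedged sketch of the remaining steps described in the comment above, reusing copy_target_policy and save as they appear in the boat examples; the file name and the second training round are illustrative, and showing the two policies side by side would use the Agent class imported above, whose interface is not shown here.

# Keep a copy of the policy after the short round of training above.
early_policy = dqn.copy_target_policy(verbose=False)

# Train a while longer, then grab the improved policy and persist it.
dqn.train(num_episodes, batch_size, training_iter, verbose=True,
          completion_reward=completion_reward, plot=True, eps_decay=True)
late_policy = dqn.copy_target_policy(verbose=False)
dqn.save('cartpole_dqn.pkl')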
Example #5
env_list = []

for i in range(10):
    env_list.append(Gridworld(width=4, height=4, cell_size=32, seed=(20 + i)))
#for i in [8, 0, 2, 4, 7]:
#    env_list.append(Gridworld(width=4, height=4, cell_size=32, seed=i))

env = MultitaskEnvironment(env_list)

for _ in range(repetitions):
    print('x')
    dqn = DQN(env,
              qnet=NeuralNet(64, 4).double(),
              plotter=plot,
              render=False,
              memory_length=2000,
              gamma=.99,
              alpha=.001,
              epsilon_start=0.1,
              plot_durations=True)
    dqn.train(training_iterations, 4, plot=False)
    histories_a.append(dqn.history)

#for seed in env.env_scores:
#    print('seed={}, score={}'.format(seed, np.mean(env.env_scores[seed])))

print('Set B')
histories_b = []

env_list = []
Example #6
File: train_dqn.py Project: Jontahan/kvad
        #x = F.relu(self.fc2(x))
        #x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x
"""


class PolicyNet(nn.Module):
    def __init__(self):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(256, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 4)

    def forward(self, x):
        x = torch.flatten(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


dqn = DQN(env,
          qnet=PolicyNet().double(),
          plotter=plot,
          render=True,
          memory_length=2000,
          gamma=.95,
          alpha=.001,
          epsilon_start=0.1)
dqn.train(2000, 4, plot=True, verbose=True)
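
After training, the greedy action for a single observation can be read straight off the Q-network, following the pattern from Example #1; treating env.reset() as a float64 numpy observation is an assumption.

with torch.no_grad():
    state = torch.from_numpy(env.reset())
    greedy_action = dqn.qnet(state).max(0)[1].item()
    print('greedy action:', greedy_action)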
Example #7
File: exp.py Project: Jontahan/kvad
        for i in range(1, len(layer_sizes)):
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Linear(layer_sizes[i - 1], layer_sizes[i]))
        
        self.nn = nn.Sequential(*self.layers)

    def forward(self, x):
        x = torch.flatten(x)
        return self.nn.forward(x)

gamma = .95
alpha = .002

num_episodes = 2000
convergence_durations = []
for i in range(50):
    print('Simulation {}/{}'.format(i + 1, 50))
    net = DenseNet(64, [4]).double()
    dqn = DQN(env, qnet=net, plotter=plot, render=True, memory_length=2000, gamma=gamma, alpha=alpha, epsilon_start=0.1, caching_interval=3000)

    for e in range(num_episodes):
        dqn.train_episode(e, num_episodes, 16, plot=True, verbose=False)
        if e > 50 and np.std(dqn.history[-50:]) < 20:
            print('Early stop after {} iterations'.format(e))
            convergence_durations.append(e)
            break

    if len(convergence_durations) <= i:
        convergence_durations.append(env.cutoff)

print('mean duration: {}'.format(np.mean(convergence_durations)))
Example #8
    def forward(self, x):
        x = torch.flatten(x)
        return self.linreg(x)
        # Unreachable leftover from the earlier MLP forward pass:
        # x = F.relu(self.fc1(x))
        # x = F.relu(self.fc2(x))
        # x = self.fc3(x)
        # return x


all_states = env_a.get_all_states()
#ql_a = Qlearning(env_a, n_states=len(all_states), n_actions=env_a.action_space.n, plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)
#ql_b = Qlearning(env_b, n_states=len(all_states), n_actions=env_b.action_space.n, plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)
dqn_a = DQN(env_a,
            qnet=PolicyNet().double(),
            plotter=None,
            render=False,
            memory_length=2000,
            gamma=.99,
            alpha=.001,
            epsilon_start=0.1)
dqn_b = DQN(env_b,
            qnet=PolicyNet().double(),
            plotter=None,
            render=False,
            memory_length=2000,
            gamma=.99,
            alpha=.001,
            epsilon_start=0.1)
#dqn.train(2000, 4, plot=True, verbose=True)

for ne in range(0, 50):
    #np.random.seed(10)
Example #9
pg.init()
screen = pg.display.set_mode(dim)
clock = pg.time.Clock()

env = BoatEnvironment(dim, screen, 
    intermediate_rewards=False,
    multi_agent=True,
    default_reward=0,
    illegal_penalty=-1
)

framerate = 60
running = True

spec = NetworkSpecification(hidden_layer_sizes=[14, 8], activation_function=nn.ReLU)
dqn = DQN(env, spec, render=False, gamma=0.99, alpha=1e-5, epsilon_start=0.9, epsilon_end=0.05, memory_length=5000)

batch_size = 2
num_episodes = 20
training_iter = 10

print('training...')
dqn.train(num_episodes, batch_size, training_iter, verbose=True, plot=True, eps_decay=True)
boat1_policy = dqn.copy_target_policy(verbose=False)
boat2_policy = dqn.copy_target_policy(verbose=False)
dqn.save('long_training.pkl')

boat1_state = env.reset()
boat2_state = boat1_state

plt.plot(dqn.history)
Example #10
File: clab15.py Project: Jontahan/ingstad
pg.init()
screen = pg.display.set_mode(dim)
clock = pg.time.Clock()

env = BoatEnvironment(dim, screen, intermediate_rewards=True, multi_agent=True)

framerate = 100
running = True

spec = NetworkSpecification(hidden_layer_sizes=[12],
                            activation_function=nn.Sigmoid)
dqn = DQN(env,
          spec,
          render=True,
          alpha=1e-5,
          epsilon_start=0.8,
          epsilon_end=0.01,
          memory_length=2000)

batch_size = 4
num_episodes = 100
training_iter = 1500

print('training...')
dqn.train(num_episodes,
          batch_size,
          training_iter,
          verbose=True,
          plot=True,
          eps_decay=True)
Example #11
import gym
from vicero.algorithms.deepqlearning import DQN, NetworkSpecification
import matplotlib.pyplot as plt


# This function defines a custom reward, overriding the one from the
# environment. This one rewards the agent based on its absolute speed
# (state[1] is the velocity in MountainCar-v0).
def state_to_reward(state):
    return abs(state[1]) * 10 - 0.05


env = gym.make('MountainCar-v0')
spec = NetworkSpecification()

dqn = DQN(env, spec=spec, state_to_reward=state_to_reward)

batch_size = 32
num_episodes = 200
training_iter = 500

dqn.train(num_episodes, batch_size, training_iter, verbose=True, plot=True)

plt.plot(dqn.history)
plt.show()

plt.plot(dqn.maxq_history)
plt.show()
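
The trained policy can be cached and persisted the same way as in the boat examples; the file name is illustrative.

policy = dqn.copy_target_policy(verbose=False)
dqn.save('mountaincar_dqn.pkl')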
Example #12
def get_diversity(env_list,
                  learning_alg,
                  state_dist,
                  softmax=False,
                  training_iterations=1000,
                  steps=10,
                  verbose=False,
                  input_size=64):
    assert not (learning_alg == 'reinforce' and softmax)
    expert_agents = []

    for i in range(len(env_list)):
        if learning_alg == 'reinforce':
            agent = Reinforce(env_list[i],
                              polinet=LogRegNet(input_size, 4).double())
        elif learning_alg == 'dqn':
            agent = DQN(env_list[i],
                        qnet=LinRegNet(input_size, 4).double(),
                        plotter=None,
                        render=False,
                        memory_length=2000,
                        gamma=.99,
                        alpha=.001,
                        epsilon_start=0.3)

        expert_agents.append(agent)

    diversity_history = []

    for step in range(steps):
        print('step {}/{}'.format(step + 1, steps))
        for i in range(len(env_list)):
            if verbose:
                print('Training expert agent for environment({}) {}/{}'.format(
                    env_list[i].seed, i + 1, len(env_list)))
            start = time.time()

            if learning_alg == 'reinforce':
                expert_agents[i].train(training_iterations)
            elif learning_alg == 'dqn':
                expert_agents[i].train(training_iterations, 4)

            end = time.time()
            if verbose: print('Elapsed time: {:.2f}s'.format(end - start))

        diff_list = []
        for i in range(len(env_list)):
            for j in range(i + 1, len(env_list)):
                env_a = env_list[i]
                env_b = env_list[j]
                agent_a = expert_agents[i]
                agent_b = expert_agents[j]

                if state_dist == 'full':
                    all_states = env_a.get_all_states() + env_b.get_all_states(
                    )
                elif state_dist == 'memory':
                    all_states = [
                        sample[0].numpy() for sample in agent_a.memory
                    ] + [sample[0].numpy() for sample in agent_b.memory]
                env_diffs = []

                with torch.no_grad():
                    for state in all_states:
                        state = torch.from_numpy(state)

                        if learning_alg == 'reinforce':
                            env_diffs.append(
                                torch.sum(
                                    (agent_a.policy_net(state) -
                                     agent_b.policy_net(state))**2).item())
                        elif learning_alg == 'dqn':
                            if softmax:
                                env_diffs.append(
                                    torch.sum(
                                        (F.softmax(agent_a.qnet(state), dim=-1)
                                         - F.softmax(agent_b.qnet(state),
                                                     dim=-1))**2).item())
                            else:
                                env_diffs.append(
                                    torch.sum((agent_a.qnet(state) -
                                               agent_b.qnet(state))**2).item())

                diff_list.append(np.mean(env_diffs))
        diversity_history.append(np.mean(diff_list))

    return diversity_history
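
A minimal usage sketch, assuming the Gridworld constructor and matplotlib setup from the earlier examples; the seeds, step count, and training length are illustrative.

env_list = [Gridworld(width=4, height=4, cell_size=32, seed=20 + i) for i in range(5)]
diversity = get_diversity(env_list, learning_alg='dqn', state_dist='memory',
                          training_iterations=200, steps=5, verbose=True)
plt.plot(diversity)
plt.show()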