Example No. 1
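    # Trainer setup: probes an Atari env for state shape and action count, then builds a DQNAgent,
    # a TensorBoard writer, and a prioritized or uniform replay memory.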
    def __init__(self, kwargs):
        kwargs["env_cls"] = Atari
        env = kwargs["env_cls"](kwargs["env_id"])
        kwargs["state_shape"] = env.observation_space.shape
        kwargs["state_dtype"] = np.uint8
        kwargs["n_actions"] = env.action_space.n
        kwargs["device"] = torch.device(kwargs["device_id"])
        env.close()
        self.__dict__.update(kwargs)
        self.agent = DQNAgent(**kwargs)
        self.writer = SummaryWriter("./log/")
        self.cuda_eval = torch.cuda.Stream(self.device)

        mem_kwargs = dict(
            capacity=self.mem_capacity,
            history_len=self.history_len,
            state_shape=self.state_shape,
            state_dtype=self.state_dtype,
            batch_sz=self.batch_sz,
            alpha=self.mem_alpha,
            beta=LinearScheduler(self.mem_beta, 1., self.train_steps),
            priority_eps=self.mem_priority_eps,
            priority_upper=self.mem_priority_upper,
            prioritized_replay=self.prioritized_replay,
            device=self.device,
        )
        mem_cls = PrioritizedReplayMemory if self.prioritized_replay else UniformReplayMemory
        self.mem = mem_cls(**mem_kwargs)
        self.mem_lock = Lock()
        self.sync = Queue(maxsize=1)
        self.sync.put(None)
Example No. 2
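    # Snake game wrapper: stores board settings, creates the snake, a snack, and a DQNAgent
    # with online and target models sized to the observation vector.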
    def __init__(self, width, height, rows, window, offx, offy, idx=""):
        self.SETTINGS = {}
        self.SETTINGS['w'] = width
        self.SETTINGS['h'] = height
        self.SETTINGS['r'] = rows
        self.SETTINGS['sB'] = width // rows
        self.SETTINGS['ox'] = offx * width
        self.SETTINGS['oy'] = offy * height
        self.idx = idx

        self.window = window

        self.snake = Snake((255, 0, 0),
                           (self.SETTINGS['r'] // 2, self.SETTINGS['r'] // 2),
                           self.SETTINGS)
        self.snack = Cube(self.randomSnack(), self.SETTINGS, color=(0, 255, 0))

        self.dist = self.get_snack_distance()

        self.walls = self.get_wall_pos()

        self.model = Model(len(self.get_observation()), 4)
        self.tgt = Model(len(self.get_observation()), 4)
        self.agent = DQNAgent(self.model, self.tgt)
        self.reward = 0.0
        self.setp_reward = 0.0
        self.rewards = []
        self.finished = False

        self.points = 0
        self.points_ls = []
Example No. 3
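# Trains a DQN on CartPole-v0 with RMSprop, an epsilon schedule, and a small replay buffer,
# then renders one evaluation episode.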
def main():
    USE_CUDA = torch.cuda.is_available()

    env = gym.make('CartPole-v0')
    dqn = DQN(env.observation_space.shape[0], env.action_space.n)
    if USE_CUDA:
        dqn = dqn.cuda()
    optimizer = optim.RMSprop(dqn.parameters(),
                              lr=0.00025,
                              momentum=0.95,
                              alpha=0.95,
                              eps=0.01)
    epsilon_schedule = get_epsilon_schedule(start=1.0,
                                            end=0.01,
                                            endt=1000,
                                            learn_start=50)
    replay_buffer = ReplayBuffer(capacity=1000)
    agent = DQNAgent(env,
                     dqn,
                     optimizer,
                     epsilon_schedule,
                     replay_buffer,
                     discount_factor=0.99,
                     target_update_rate=10,
                     batch_size=32,
                     learn_start=50)

    agent.train(5000)
    total_reward = agent.play(render=True)
    agent.env.close()
    print('Total Reward: ', total_reward)
Example No. 4
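# TD-learning loop for a board game: random moves until start_learn, then greedy actions,
# prioritized replay sampling, and periodic target/epsilon/beta updates.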
def td_learning(args):
    agent = DQNAgent(args)
    replay_memory = PrioritizedReplayBuffer(1000000, args.alpha)
    #eval_game(agent, 500)
    outer = tqdm(range(args.total_steps), desc='Total steps', position=0)
    game = init_game()
    ave_score = 0
    count = 0
    for step in outer:
        board = copy.deepcopy(game.gameboard.board)
        if step < args.start_learn:
            avail_choices = game.gameboard.get_available_choices()
            index = np.random.randint(len(avail_choices))
            choice = avail_choices[index]
        else:
            choice = agent.greedy_policy(
                board, game.gameboard.get_available_choices())

        next_board, reward = game.input_pos(choice[0], choice[1])
        next_board = copy.deepcopy(next_board)
        #####

        replay_memory.add(board, choice, reward, next_board)
        #####
        if game.termination():
            ave_score += game.gameboard.score
            count += 1
            game = init_game()

        if step >= args.start_learn and step % args.train_freq == 0:
            if count > 0:
                message = "ave score of " + str(count) + " game: " + str(
                    ave_score / count)
                out_fd.write("{} {}\n".format(step, ave_score / count))
                outer.write(message)
                ave_score = 0
                count = 0
            if step == args.start_learn:
                experience = replay_memory.sample(args.start_learn,
                                                  beta=agent.beta)
            else:
                experience = replay_memory.sample(args.train_data_size,
                                                  beta=agent.beta)

            boards, choices, rewards, next_boards, weights, batch_idxes = experience

            td_errors = agent.train(
                (boards, choices, rewards, next_boards, weights))
            new_priorities = np.abs(td_errors) + prioritized_replay_eps
            replay_memory.update_priorities(batch_idxes, new_priorities)

            agent.update_target(args.soft_tau)
            agent.update_epsilon()
            agent.update_beta()

    eval_game(agent, 500)
    out_fd.close()
Example No. 5
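# Offline replay pass: loads the replay memory for the current episode, runs repeated
# memory_replay() updates with periodic target-model syncs, saves the model, and logs average Q values.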
def main():
    epi_file = open('../files/episode.txt')
    episode = epi_file.readline()
    epi_file.close()
    episode = int(episode) - 1
    qagent = DQNAgent(episode - 1)
    qagent.load_memory_of_episode(episode)
    qys = []
    qds = []
    for k in range(50):
        for j in range(5):
            # for i in range(0,len(qagent.memory),qagent.batch_size):
            qy, qd = qagent.memory_replay()
        qagent.update_targer_model()
        qys.append(qy)
        qds.append(qd)
    qagent.save_model(episode)
    res = time.strftime('%Y/%m/%d-%H:%M:%S', time.localtime(
        time.time())) + "Average of episode: %d Q_y: %f Q_d: %f" % (
            episode, np.mean(qys), np.mean(qds))
    epi_file = open('../files/avg_Q.txt', 'a')
    epi_file.write(res + '\n')
    epi_file.close()

    if forward:
        epi_file = open('../files/episode.txt', 'w')
        epi_file.write(str(episode + 2))
        epi_file.close()
Example No. 6
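# Sanity check that the loaded policy and target models give identical predictions
# for a hand-built 6x7 board state.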
def test_target_model():
    agent = DQNAgent()
    agent.load('models/model.h5')
    state = np.zeros([6, 7])
    state[5][3] = 1
    state = state.reshape(1, 6, 7, 1)
    p1 = agent.policy_model.predict(state)
    p2 = agent.target_model.predict(state)
    print(p1)
    print(p2)
    if not np.array_equal(p1, p2):
        print('FAIL')
Example No. 7
def dqn_run(episodes=2500,
            eps_start=1.0,
            eps_end=0.01,
            eps_decay=0.995,
            double_dqn=False,
            dueling_dqn=False,
            seed=42):
    env = start_env()
    env_info = reset_env_info(env)

    state_size = get_state_size(env_info)
    action_size = get_action_size(env)

    print('Seed used:', seed)
    agent = DQNAgent(state_size, action_size, double_dqn, dueling_dqn, seed)

    scores = []
    scores_window = deque(maxlen=100)
    eps = eps_start

    for episode in range(1, episodes + 1):
        env_info = reset_env_info(env)
        score = 0.0
        done = False
        while not done:
            state = env_info.vector_observations[0]
            action = agent.act(state, epsilon=eps)
            env_info = env_step(env, action)
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            score += reward

        scores_window.append(score)
        scores.append(score)
        eps = max(eps * eps_decay, eps_end)
        print('\rEpisode {}/{}\tAverage Score: {:.2f}, epsilon: {:.3f}'.format(
            episode, episodes, np.mean(scores_window), eps),
              end='     ')
        if episode % 100 == 0:
            print('\rEpisode {}/{}\tAverage Score: {:.2f}, epsilon: {:.3f}'.
                  format(episode, episodes, np.mean(scores_window), eps))
        if np.mean(scores_window) > 13.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break

    env.close()
    return scores
Example No. 8
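# Loads a saved model and runs a single greedy (explore=False) CartPole-v1 episode with rendering.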
def test_model(filename):
	env = gym.make("CartPole-v1")
	agent = DQNAgent(4, 2)
	agent.load_model(filename)

	state = env.reset()

	for _ in range(1000):
		env.render()
		state, _, done, _ = env.step(agent.act(state, explore=False))
		if done:
			break

	env.close()
Example No. 9
    def __init__(self):
        pygame.init()
        self.window = pygame.display.set_mode((500, 800))
        pygame.display.set_caption("Racing AI")

        self.clock = pygame.time.Clock()
        self.execute = True

        self.car = Car(250, 650, self.window)
        self.agent = DQNAgent(inputs=4, n_actions=2)
        self.episode_durations = []

        self.update_agent = pygame.USEREVENT + 1
        update_timer = 100
        pygame.time.set_timer(self.update_agent, update_timer)
Example No. 10
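# Evaluates a trained agent for test_eps rendered episodes and prints mean/min/max episode rewards.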
def test(agent: DQNAgent, test_eps):
    env = gym.make(ENV_NAME)
    ep_rewards = []

    for test_ep in range(test_eps):
        obs = env.reset()
        done = False

        ep_reward = 0
        ep_step = 0

        while not done:

            action = agent.act(np.array(obs), evaluate=True)
            next_obs, reward, done, _ = env.step(action)
            env.render()

            obs = next_obs

            ep_reward += reward
            ep_step += 1

        ep_rewards.append(ep_reward)
        time.sleep(0.2)

    print('\n')
    print('=== Test performance ===')
    print(f'Mean: {np.mean(ep_rewards):.1f} / '
          f'Min: {np.min(ep_rewards):.1f} / '
          f'Max: {np.max(ep_rewards):.1f}')

    env.close()
    return ep_rewards
Example No. 11
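# Replays a saved (D)DQN agent, with or without prioritized replay, in the Banana environment
# and prints the running score of each play.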
def play(**kwargs):
    env = BananaEnvironment(file_name=kwargs['env_file'],
                            num_stacked_frames=kwargs['num_stacked_frames'])
    agent_name = kwargs['agent_fname']
    is_per = 'PER' in agent_name
    if 'ddqn' in agent_name:
        agent = DDQNAgentPER.load(agent_name) if is_per else DDQNAgent.load(
            agent_name)
    elif 'dqn' in agent_name:
        agent = DQNAgentPER.load(agent_name) if is_per else DQNAgent.load(
            agent_name)
    else:
        raise KeyError('Unknown agent type')

    for i in range(kwargs['num_plays']):
        done = False
        score = 0
        state = env.reset(train_mode=False)
        while not done:
            action = agent.act(state, eps=0.)
            state, reward, done = env.step(action)  # roll out transition
            score += reward
            print("\r play #{}, reward: {} | score: {}".format(
                i + 1, reward, score),
                  end='')
        print()
Example No. 12
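# Launches a Unity environment, infers state and action sizes from the default brain,
# and runs DQN training through a Report wrapper.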
def run(novis, env_dir, env_file, n_episodes, seed, prioritized, cpu):
    if novis:
        env_dir = "{}_NoVis".format(env_dir)

    env = UnityEnvironment(file_name="environments/{}/{}".format(env_dir, env_file))

    # get default brain
    brain_name = env.brain_names[0]
    brain      = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    # print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    # print('Number of actions:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    # print('States look like:', state)
    state_size = len(state)
    # print('States have length:', state_size)

    agent = DQNAgent(state_size=state_size, action_size=action_size,
                     seed=seed, prioritized=prioritized, cpu=cpu)
    report = Report(agent).run(dqn, env=env, brain_name=brain_name,
                               n_episodes=n_episodes)
    print(report)
Example No. 13
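# Leader purge round (Python 2): loads all leader checkpoints, scores each against an ensemble
# of the others, keeps the top NUM_LEADERS, and moves the rest to a graveyard directory.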
def purge_round():
    candidate_leaders_map = {}  # {filename --> agent}

    # Load in all of the leaders
    for leader_checkpoint in os.listdir(LEADER_DIR):
        path = os.path.join(LEADER_DIR, leader_checkpoint)
        candidate_leader = try_gpu(
            DQNAgent(6,
                     LinearSchedule(0.05, 0.05, 1),
                     OBSERVATION_MODE,
                     lr=LR,
                     max_grad_norm=GRAD_CLIP_NORM,
                     name=leader_checkpoint))
        candidate_leader.load_state_dict(
            torch.load(path, map_location=lambda storage, loc: storage))
        candidate_leaders_map[leader_checkpoint] = candidate_leader

    candidate_scores = []  # list[(filename, score)]
    filenames, candidate_leaders = zip(*candidate_leaders_map.items())
    for i, (filename,
            candidate_leader) in enumerate(zip(filenames, candidate_leaders)):
        print "EVALUATING {}".format(candidate_leader.name)
        leaders = EnsembleDQNAgent(candidate_leaders[:i] +
                                   candidate_leaders[i + 1:])
        candidate_scores.append((filename,
                                 evaluate(candidate_leader, leaders,
                                          EPISODES_EVALUATE_PURGE)))
    sorted_scores = sorted(candidate_scores, key=lambda x: x[1], reverse=True)

    print "SCORES: {}".format(sorted_scores)
    for filename, score in sorted_scores[NUM_LEADERS:]:
        print "PURGING ({}, {})".format(filename, score)
        leader_path = os.path.join(LEADER_DIR, filename)
        graveyard_path = os.path.join(GRAVEYARD_DIR, filename)
        os.rename(leader_path, graveyard_path)
Example No. 14
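# Flask endpoint: overrides a reset environment state with form inputs, loads trained weights,
# and maps the agent's action combo to sell/hold/buy advice for three assets
# (env, scaler, and action maps are defined at module level).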
def advise():
    n1 = float(request.form['n1'])
    n2 = float(request.form['n2'])
    n3 = float(request.form['n3'])
    cash = float(request.form['cash'])
    print(n1)
    print(cash)

    agent = DQNAgent(state_size, action_size)
    scaler = get_scaler(env)
    agent.load("202005011635-dqn.h5")

    state = env.reset()
    state[0] = n1
    state[1] = n2
    state[2] = n3
    state[-1] = cash
    state = scaler.transform([state])

    action = agent.act(state)
    # action_combo = list(map(list, itertools.product([0, 1, 2], repeat=3)))
    action_vec = action_combo[action]
    # action_map = {0: "sell", 1: "hold", 2: "buy"}

    # print(action_map[action_vec[0]], action_map[action_vec[1]], action_map[action_vec[2]])

    ans = []
    tmp = 1 if action_vec[0] == 0 and n1 == 0 else action_vec[0]
    if cash == 0 and tmp == 2: tmp = 1
    ans.append(action_map[tmp])
    tmp = 1 if action_vec[1] == 0 and n2 == 0 else action_vec[1]
    if cash == 0 and tmp == 2: tmp = 1
    ans.append(action_map[tmp])
    tmp = 1 if action_vec[2] == 0 and n3 == 0 else action_vec[2]
    if cash == 0 and tmp == 2: tmp = 1
    ans.append(action_map[tmp])

    print(ans)
    return render_template('index.html',
                           ans=ans,
                           n1=n1,
                           n2=n2,
                           n3=n3,
                           cash=cash)
Example No. 15
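# Command-line entry point: parses DQN hyperparameters, seeds everything,
# builds the gym environment, and hands control to agent.run().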
def main():
    parser = argparse.ArgumentParser(description='DQN')
    parser.add_argument('--env', type=str,
                        default='MsPacman-v0')  # 'Breakout-v0'
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--eps', type=float, default=1.0)
    parser.add_argument('--exploration_decay_speed', type=int, default=1000000)
    parser.add_argument('--eps_min', type=float, default=0.1)
    parser.add_argument('--log_size', type=int, default=100)
    parser.add_argument('--buffer_size', type=int, default=100000)
    parser.add_argument('--buffer_init_size', type=int, default=50000)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--sync_period', type=int, default=10000)
    parser.add_argument('--learn_freq', type=int, default=4)
    parser.add_argument('--save_freq', type=int, default=100)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument('--exp-dir', type=str, default='exp')
    args = parser.parse_args()
    args.device = torch.device(args.device if torch.cuda.is_available() \
        and args.device.startswith('cuda') else 'cpu')

    work_dir = mkdir(args.exp_dir, args.env)  # save models

    # logging infos
    logging.basicConfig(filename=args.env + '.log',
                        filemode='w',
                        level=logging.INFO)

    env = gym.make(args.env)

    # set seed
    env.seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    torch.manual_seed(args.seed)
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

    agent = DQNAgent(env, args, work_dir)
    agent.run()
Example No. 16
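# Runs one episode against an external simulator via send_action(): syncs first,
# then takes greedy actions for t_steps (helpers and t_steps defined elsewhere).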
def main():
    if len(sys.argv) > 1:
        host = sys.argv[1]
    epi_file = open('../files/episode.txt')
    episode = epi_file.readline()
    epi_file.close()
    episode = int(episode)
    qagent = DQNAgent(14)
    data = 'x'
    while data != '9':
        data = send_action(9)
    ys, ds = qagent.get_data(episode, 0)
    state = np.concatenate((ys, ds), axis=0)

    for step in range(1, t_steps + 1):
        action = qagent.get_action(state)
        reward = send_action(action)
        ys, ds = qagent.get_data(episode, step)
        n_state = np.concatenate((ys, ds), axis=0)
        state = n_state
Example No. 17
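# getopt-based entry point (Python 2): optionally loads a checkpoint, then either trains the agent
# for 6,000,000 steps or tests it for 2000 steps inside a TensorFlow session.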
def main(argv):
    # Pretrained network to use
    inputfile = None
    # Wether to train or to test
    train = False
    # Trained network
    outputfile = None

    try:
        opts, args = getopt.getopt(argv, "hrl:s:", ["loadckpt=", "saveckpt="])
    except getopt.GetoptError:
        print 'Incorrect usage. For more information: test.py -h'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'python test.py -r -l <ckptfile> -s <ckptfile>'
            print '-r for enabling training'
            print '-l for loading pre-existing model'
            print '-s for saving  model to file'
            sys.exit()
        elif opt == '-r':
            train = True
        elif opt in ("-l", "--loadckpt"):
            inputfile = arg
        elif opt in ("-s", "--saveckpt"):
            outputfile = arg

    with tf.Session() as sess:
        env = Environment()
        agent = DQNAgent(env, sess, inputfile)
        if train:
            agent.train(6000000, outputfile)
        else:
            agent.test(2000)
Example No. 18
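    # Trainer setup: stores hyperparameters, creates the environment, a TensorBoard writer,
    # and online/target DQNAgent networks with synced weights.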
    def __init__(self,
                 env_creator,
                 device,
                 buffer_size,
                 save_dir,
                 timesteps_per_epoch=1,
                 batch_size=32,
                 total_steps=5 * 10 ** 5,
                 decay_rate=0.1,
                 init_epsilon=1,
                 final_epsilon=0.02,
                 loss_freq=50,
                 refresh_target_network_freq=500,
                 eval_freq=500,
                 max_grad_norm=50):

        self.env_creator = env_creator
        self.env = env_creator()
        n_actions = self.env.action_space.n
        state_shape = self.env.observation_space.shape

        self.save_dir = save_dir
        self.buffer_size = buffer_size
        self.timesteps_per_epoch = timesteps_per_epoch
        self.batch_size = batch_size
        self.total_steps = total_steps
        self.decay_steps = decay_rate * total_steps
        self.init_epsilon = init_epsilon
        self.final_epsilon = final_epsilon
        self.loss_freq = loss_freq
        self.refresh_target_network_freq = refresh_target_network_freq
        self.eval_freq = eval_freq
        self.max_grad_norm = max_grad_norm
        self.device = device

        self.writer = SummaryWriter('runs')

        self.agent = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)
        self.target_network = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)
        self.target_network.load_state_dict(self.agent.state_dict())
Example No. 19
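# Runs one epsilon-greedy episode against an external simulator via send_action(), logs the actions
# and rewards to files, and advances the episode counter (paths and helpers defined elsewhere).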
def main():
    if len(sys.argv) > 1:
        host = sys.argv[1]
    ep_reward_file = memory_path + 'ep_reward.dat'
    epi_file = open('../files/episode.txt')
    episode = epi_file.read(1)
    epi_file.close()
    qagent = DQNAgent()
    data = 'x'
    while (data != '9'):
        data = send_action(9)
    ys, ds = qagent.get_data(episode, 0)
    state = np.concatenate((ys, ds), axis=0)
    actions = []
    rewards = []
    for step in range(1, t_steps + 1):
        action = qagent.e_get_action(state)
        # action = qagent.get_action(state)
        reward = send_action(action)
        ys, ds = qagent.get_data(episode, step)
        n_state = np.concatenate((ys, ds), axis=0)
        actions.append(action)
        rewards.append(reward)
        state = n_state
    #save-actions,rewards
    actions = map(str, actions)
    rewards = map(str, rewards)
    r_file = open(reward_file, 'a')
    a_file = open(action_file, 'a')
    r_str = ','.join(rewards)
    a_str = ','.join(actions)
    r_file.write(r_str + '\n')
    a_file.write(a_str + '\n')
    r_file.close()
    a_file.close()
    print("episode : ", episode, " finished.")
    episode = str(int(episode) + 1)
    epi_file = open('../files/episode.txt', 'w')
    epi_file.write(episode)
    epi_file.close()
Example No. 20
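# Plays one rendered Tetris game with a trained agent: picks the best next state
# from env.get_next_states() at every step until the game ends.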
def eval():
    env = Tetris()
    max_steps = None
    epsilon_stop_episode = 1500
    mem_size = 20000
    discount = 0.95
    batch_size = 512
    epochs = 1
    replay_start_size = 2000
    n_neurons = [32, 32]
    render_delay = None
    activations = ['relu', 'relu', 'linear']

    agent = DQNAgent(env.get_state_size(),
                     n_neurons=n_neurons,
                     activations=activations,
                     epsilon=0,
                     epsilon_stop_episode=epsilon_stop_episode,
                     mem_size=mem_size,
                     discount=discount,
                     replay_start_size=replay_start_size,
                     train=False)
    agent.load("ckpts/591_model.ckpt")

    current_state = env.reset()
    done = False
    steps = 0

    # Game
    while not done and (not max_steps or steps < max_steps):
        next_states = env.get_next_states()
        best_state = agent.best_state(next_states.values())

        best_action = None
        for action, state in next_states.items():
            if state == best_state:
                best_action = action
                break

        reward, done = env.play(best_action[0],
                                best_action[1],
                                render=True,
                                render_delay=render_delay)

        agent.add_to_memory(current_state, next_states[best_action], reward,
                            done)
        current_state = next_states[best_action]
        steps += 1
Example No. 21
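# Epsilon-greedy training loop for a custom pygame-based environment: stores transitions,
# trains every step, and prints running reward statistics every 10 episodes.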
def DQN(episodes, epsilon, epsilonDeca):
    env = Env()
    agent = DQNAgent()
    #window=pygame.display.set_mode((windowWidth,windowHeight))
    episodeRewards = []
    for episode in range(episodes):
        episode_reward = 0
        step = 1
        current_state = env.reset()
        done = False
        while not done:

            # This part stays mostly the same, the change is to query a model for Q values
            if np.random.random() > epsilon:
                # Get action from Q table

                action = np.argmax(agent.getQs(np.array(current_state)))
            else:
                # Get random action
                action = np.random.randint(0, env.ACTION_SPACE_SIZE)

            new_state, reward, done = env.step(action)
            episode_reward += reward

            #drawWindow(window,[env.blob,env.enemyBlob],[env.ball],env.wall)

            # Every step we update replay memory and train main network
            agent.updateReplyMemory(
                (current_state, action, reward, new_state, done))
            agent.train(done, step)
            current_state = new_state
            step += 1
        episodeRewards.append(episode_reward)
        if episode % 10 == 0:
            averageReward = sum(episodeRewards) / len(episodeRewards)
            minReward = min(episodeRewards)
            maxReward = max(episodeRewards)
            print(
                f"replayMemo:{len(agent.replayMemory)}  avg:{averageReward} \n  min:{minReward}  \n  max:{maxReward} "
            )
        if epsilon > MIN_EPSILON:
            epsilon *= EPSILON_DECAY
            epsilon = max(MIN_EPSILON, epsilon)

    pygame.quit()
Example No. 22
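# Showcases a trained agent in the Donkey Car simulator: stacks preprocessed camera frames,
# maps discrete actions to steering, and tracks score, reward, and laps per episode.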
def main():
    # enable GPU memory growth
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

    # model
    model_name = input("Model name -> ")
    model_file = input("Model file -> ")
    my_model = "models/{}/{}.h5".format(model_name, model_file)

    epsilon = float(input("Epsilon -> "))
    episode_count = int(input("Episode count -> "))

    print("Loading", my_model, "with epsilon", epsilon)
    agent = DQNAgent(my_model, float(epsilon))

    # information
    resizeScale = (40, 30)
    frame_n = 3
    max_cte = 4.35

    # statistics
    score = []
    rewards = []
    highest_score = 0
    highest_reward = 0
    max_score = None

    # velocity
    max_velocity = 10.0
    max_acceleration = 0.75

    # steering
    max_steering = 0.75
    steering_step = 2 * max_steering / (agent.action_space - 1)
    steering_table = [
        i * steering_step - max_steering for i in range(agent.action_space)
    ]

    # setup donkey environment
    conf = {
        # "exe_path":"remote",
        "exe_path": "D:/sdsandbox/build2/donkey_sim.exe",
        "host": "127.0.0.1",
        "port": 9094,
        "body_style": "donkey",
        "body_rgb": (128, 128, 128),
        "car_name": "rl",
        "font_size": 100
    }

    # env = gym.make("donkey-generated-roads-v0", conf=conf)
    env = gym.make("donkey-generated-track-v0", conf=conf)
    env.viewer.handler.max_cte = max_cte
    cv2.namedWindow("camera")

    start = time.time()
    first_start = start

    for e in range(episode_count):
        # at each episode, reset environment to starting position
        state = env.reset()
        states = np.empty((frame_n, resizeScale[1], resizeScale[0], 3))
        states[0] = preprocessImage(state, resizeScale)
        need_frames = frame_n - 1

        done = False
        score.append(0)
        rewards.append(0.0)
        last_velocity = [0.0]
        laps = 0
        start = time.time()

        while not done and (score[-1] < max_score if max_score else True):
            if need_frames > 0:
                next_state, reward, done, info = env.step([
                    steering_table[random.randint(0, agent.action_space - 1)],
                    0.15
                ])

                states[frame_n - need_frames] = preprocessImage(
                    next_state, resizeScale)
                need_frames -= 1

                last_velocity.append(info["speed"])
                continue

            # select action, observe environment, calculate reward
            action, Q = agent.act(np.asarray([states]))
            steering = steering_table[action]
            throttle = calculateThrottle(last_velocity[-1], max_velocity,
                                         max_acceleration)

            next_state, reward, done, info = env.step([steering, throttle])

            img = cv2.resize(next_state, (320, 240),
                             interpolation=cv2.INTER_AREA)
            cv2.imshow("camera", img)

            last_velocity.append(round(info["speed"], 4))
            if abs(info["cte"]) >= max_cte:
                done = True
                reward = -1.0

            # for track
            else:
                reward = (1.0 - (abs(info["cte"]) / max_cte))

            # for roads
            # if not done:
            # reward = (1.0 - (abs(info["cte"]) / max_cte));

            if info["lap_finished"]:
                laps += 1

            score[-1] += 1
            rewards[-1] += reward

            # for roads
            # if self.score[-1] > 1500:
            # laps = max_laps

            next_states = np.roll(states, -1, axis=0)
            next_states[-1] = preprocessImage(next_state, resizeScale)
            states = next_states

            cv2.waitKey(1)

        env.step([0.0, -0.03])

        if len(score) > 20: score = score[-20:]
        if len(rewards) > 20: rewards = rewards[-20:]

        if score[-1] >= highest_score:
            highest_score = score[-1]

        if rewards[-1] >= highest_reward:
            highest_reward = rewards[-1]

        print(
            "episode: {}/{}, score: {}, reward: {}, laps: {}, e: {:.2}".format(
                e + 1, episode_count, score[-1], round(rewards[-1], 2), laps,
                round(agent.epsilon, 2)))

        if (e + 1) % 5 == 0:
            print("Took", round((time.time() - start) / 60, 2), "minutes\n")
            start = time.time()

    print("Showcase time:", round((time.time() - first_start) / 60, 2),
          "minutes")
Example No. 23
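# Super Mario Bros setup: right-only action space, wrapped frames, and a double-Q DQNAgent
# with a 100k-transition memory, followed by the main episode loop.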
from utils import get_args


# Take argument
arg = get_args()

# Build env (first level, right only)
env = gym_super_mario_bros.make(arg.env)
env = JoypadSpace(env, RIGHT_ONLY)
env = wrapper(env)
# Parameters
states = (84, 84, 4)
actions = env.action_space.n

# Agent
agent = DQNAgent(states=states, actions=actions, max_memory=100000, double_q=True)

# Episodes
# episodes = 100001
episodes = 101
rewards = []

# Timing
start = time.time()
step = 0

# Main loop
for e in range(episodes):

    # Reset env
    state = env.reset()
Example No. 24
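# Builds a DQN or double-DQN agent around a shared model, enables reward/episode/max-Q tracking
# plus periodic model and plot saving, and trains on the ViZDoom "basic" scenario.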
def main():

    print("Creating model...")
    model = modelutils.create_model(number_of_actions=4)
    model.summary()

    print("Creating agent...")
    if agent_type == "dqn":
        agent = DQNAgent(name="doom-dqn",
                         model=model,
                         number_of_actions=4,
                         gamma=0.99,
                         final_epsilon=0.0001,
                         initial_epsilon=0.1,
                         number_of_iterations=200000,
                         replay_memory_size=10000,
                         minibatch_size=32)
    elif agent_type == "ddqn":
        agent = DDQNAgent(name="doom-ddqn",
                          model=model,
                          number_of_actions=4,
                          gamma=0.99,
                          final_epsilon=0.0001,
                          initial_epsilon=0.1,
                          number_of_iterations=200000,
                          replay_memory_size=10000,
                          minibatch_size=32,
                          model_copy_interval=100)
    agent.enable_rewards_tracking(rewards_running_means_length=1000)
    agent.enable_episodes_tracking(episodes_running_means_length=1000)
    agent.enable_maxq_tracking(maxq_running_means_length=1000)
    agent.enable_model_saving(model_save_frequency=10000)
    agent.enable_plots_saving(plots_save_frequency=10000)

    print("Creating game...")
    #environment = Environment(headless=("headless" in sys.argv))
    # Create an instance of the Doom game.
    environment = DoomGame()
    environment.load_config("scenarios/basic.cfg")
    environment.set_screen_format(ScreenFormat.GRAY8)
    environment.set_window_visible("headless" not in sys.argv)
    environment.init()

    print("Training ...")
    train(agent, environment, verbose="verbose" in sys.argv)
Example No. 25
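# Traffic-signal control: creates one DQNAgent per intersection, fed lane-count observations
# and a pressure-based reward, and sets up travel-time/throughput/speed/waiting-time metrics.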
world = World(args.config_file, thread_num=args.thread)

# create agents
agents = []

#parameters['buffer_size'] = parameters['buffer_size']*len(world.intersections)
#parameters['batch_size'] = parameters['batch_size']*len(world.intersections)

for i in world.intersections:
    action_space = gym.spaces.Discrete(len(i.phases))
    agents.append(
        DQNAgent(
            action_space,
            LaneVehicleGenerator(world,
                                 i, ["lane_count"],
                                 in_only=True,
                                 average=None,
                                 scale=.025),
            PressureRewardGenerator(world, i, scale=0.005, negative=True),
            i.id, parameters, world))
    if args.load_model:
        agents[-1].load_model(args.save_dir)

# Create metric
metric = [
    TravelTimeMetric(world),
    ThroughputMetric(world),
    SpeedScoreMetric(world),
    MaxWaitingTimeMetric(world)
]
Example No. 26
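# Training entry point for PongNoFrameskip-v4: builds the agent with its replay-buffer
# and epsilon settings and runs the episode loop.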
import numpy as np
from agent import DQNAgent
from utils import make_env

if __name__ == "__main__":
    env = make_env('PongNoFrameskip-v4')
    best_score = -np.inf
    n_games = 200
    agent = DQNAgent(gamma=0.99,
                     epsilon=1.0,
                     lr=1e-4,
                     n_actions=env.action_space.n,
                     input_dims=(env.observation_space.shape),
                     mem_size=50000,
                     batch_size=32,
                     eps_min=0.1,
                     eps_dec=1e-5,
                     tau=1000,
                     env_name='PongNoFrameskip-v4',
                     chkpt_dir='models/')

    n_steps = 0
    scores, eps_history = [], []

    for i in range(n_games):
        done = False
        state = env.reset()
        score = 0

        while not done:
            action = agent.choose_action(state)
Example No. 27
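# Full TensorFlow training loop: optionally restores model/memory, fills the replay buffer with
# random actions, then runs epsilon-greedy steps with periodic logging and checkpointing.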
def main(config, max_num_of_steps, max_num_of_episodes, load_model, save_model,
         load_memory, save_memory, log_path):
    agent = DQNAgent(config)

    with agent.graph.as_default():
        if load_model:
            step = agent.load_model(load_model)
            screen_log.info("Load model: {}".format(load_model))
            screen_log.info("Start from step {}".format(step))
        else:
            step = 0

        if load_memory:
            agent.load_memory(load_memory)
            n_frames = len(agent.memory)
            screen_log.info("Load memory: {}".format(load_memory))
            screen_log.info("Memory size: {}".format(n_frames))

        log_name = ('{:02}{:02}{:02}{:02}{:02}'.format(*time.localtime()[1:6]))
        summary_writer = tf.summary.FileWriter(logdir=os.path.join(
            log_path, '{}'.format(log_name)),
                                               graph=agent.graph)

        episode = 0
        rewards_per_episode = []
        sum_Qs = .0
        sum_losses = .0

        try:
            while (step < max_num_of_steps and episode < max_num_of_episodes):
                episode += 1
                episode_done = False

                next_observation = reset_random_env()
                next_observation = preprocess_observation(next_observation)

                rewards_per_episode.append(0)

                while not episode_done:
                    observation = next_observation

                    if len(agent.memory) < config['replay_start_size']:
                        # init replay memory
                        action = env.action_space.sample()

                        next_observation, reward, episode_done, info = env.step(
                            action)
                        next_observation = preprocess_observation(
                            next_observation)
                        agent.memory.append(
                            MemoryItem(observation, action, reward,
                                       episode_done, info))

                        continue

                    state = agent.get_recent_state(observation)
                    Qs = agent.get_Q_values(state)
                    Qs = Qs[0]

                    # epsilon-greedy action selection
                    epsilon = get_epsilon(config, step)
                    if np.random.RandomState().rand() < epsilon:
                        action = env.action_space.sample()
                    else:
                        action = agent.get_action_from_Q(Qs)

                    next_observation, reward, episode_done, info = env.step(
                        action)
                    next_observation = preprocess_observation(next_observation)
                    agent.memory.append(
                        MemoryItem(observation, action, reward, episode_done,
                                   info))

                    step += 1
                    rewards_per_episode[-1] += reward
                    sum_Qs += Qs[action]

                    # train step
                    loss, loss_summary_str = agent.optimize_Q()
                    summary_writer.add_summary(loss_summary_str, step)
                    sum_losses += loss

                    if step % 1000 == 0:
                        ave_loss = sum_losses / step
                        ave_reward = np.mean(rewards_per_episode)
                        ave_Q = sum_Qs / step

                        [Q_summary_str, reward_summary_str
                         ] = agent.evaluate(ave_reward, ave_Q)

                        summary_writer.add_summary(Q_summary_str, step)
                        summary_writer.add_summary(reward_summary_str, step)

                        screen_log.info(
                            'step: {}, ave. loss: {:g}, '
                            'ave. reward: {:g}, ave. Q: {:g}'.format(
                                step,
                                ave_loss,
                                ave_reward,
                                ave_Q,
                            ))
                    if step % 10000 == 0:
                        agent.save_model(save_model, step)
                    if step % 1000000 == 0:
                        agent.save_memory(save_memory, step)

        except KeyboardInterrupt:
            print("\nUser interrupted training...")
        finally:
            summary_writer.close()

            agent.save_model(save_model, step)
            agent.save_memory(save_memory, step)

        screen_log.info(
            'Finished: the number of steps {}, the number of episodes {}.'.
            format(step, episode))
Example No. 28
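  # Stock-trading setup: rounds and splits the price data into train/test, builds a TradingEnv
  # and DQNAgent, fits a scaler, and in test mode reloads the trained weights.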
  maybe_make_dir('weights')
  maybe_make_dir('portfolio_val')

  timestamp = time.strftime('%Y%m%d%H%M')

  data = np.around(get_data())
  data_size = data.shape[1]
  data_cut_point = int(0.75*data_size)
  train_data = data[:, :data_cut_point]
  test_data = data[:, data_cut_point:]

  env = TradingEnv(train_data, args.initial_invest)
  state_size = env.observation_space.shape
  action_size = env.action_space.n
  agent = DQNAgent(state_size, action_size)
  scaler = get_scaler(env)

  portfolio_value = []

  if args.mode == 'test':
    # remake the env with test data
    env = TradingEnv(test_data, args.initial_invest)
    # load trained weights
    agent.load(args.weights)
    # when test, the timestamp is same as time when weights was trained
    timestamp = re.findall(r'\d{12}', args.weights)[0]

  for e in range(args.episode):
    state = env.reset()
    state = scaler.transform([state])
Example No. 29
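# Large argparse driver for DQN on iLOCuS: builds a replay memory, Q-network, and agent, then
# either evaluates from a checkpoint or compiles and fits the agent on the driver-simulation environment.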
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description="Run DQN on iLOCuS")
    parser.add_argument("--network_name",
                        default="deep_q_network",
                        type=str,
                        help="Type of model to use")
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size")
    parser.add_argument("--map_shape",
                        default=(15, 15),
                        type=tuple,
                        help="map size")
    parser.add_argument("--num_actions",
                        default=4,
                        type=int,
                        help="level of pricing")

    parser.add_argument("--gamma",
                        default=0.8,
                        type=float,
                        help="Discount factor")
    parser.add_argument("--alpha",
                        default=0.0001,
                        type=float,
                        help="Learning rate")
    parser.add_argument("--epsilon",
                        default=0.5,
                        type=float,
                        help="Exploration probability for epsilon-greedy")
    parser.add_argument("--target_update_freq",
                        default=10000,
                        type=int,
                        help="Frequency for copying weights to target network")
    parser.add_argument(
        "--num_iterations",
        default=5000000,
        type=int,
        help="Number of overal interactions to the environment")
    parser.add_argument("--max_episode_length",
                        default=200000,
                        type=int,
                        help="Terminate earlier for one episode")
    parser.add_argument("--train_freq",
                        default=4,
                        type=int,
                        help="Frequency for training")
    parser.add_argument("--num-burn-in",
                        default=10000,
                        type=int,
                        help="number of memory before train")

    parser.add_argument("-o",
                        "--output",
                        default="ilocus-v0",
                        type=str,
                        help="Directory to save data to")
    parser.add_argument("--seed", default=0, type=int, help="Random seed")
    parser.add_argument("--train",
                        default=True,
                        type=bool,
                        help="Train/Evaluate, set True if train the model")
    parser.add_argument("--model_path",
                        default="atari-v0",
                        type=str,
                        help="specify model path to evaluation")
    parser.add_argument("--max_grad",
                        default=1.0,
                        type=float,
                        help="Parameter for huber loss")
    parser.add_argument("--log_dir",
                        default="log",
                        type=str,
                        help="specify log folder to save evaluate result")
    parser.add_argument(
        "--flip_coin",
        default=False,
        type=str,
        help="specify whether or not choosing double q learning")
    parser.add_argument("--eval_num",
                        default=100,
                        type=int,
                        help="number of evaluation to run")
    parser.add_argument("--save_freq",
                        default=100000,
                        type=int,
                        help="model save frequency")

    # memory related args
    parser.add_argument("--buffer_size",
                        default=100000,
                        type=int,
                        help="reply memory buffer size")
    parser.add_argument(
        "--look_back_steps",
        default=4,
        type=int,
        help="how many previous pricing tables will be fed into RL")

    args = parser.parse_args()
    print("\nParameters:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # Initiating policy for both tasks (training and evaluating)
    policy = LinearDecayGreedyEpsilonPolicy(args.epsilon, 0.1, 1000000,
                                            args.num_actions)

    if not args.train:
        '''Evaluate the model'''
        # check model path
        if args.model_path == '':
            print("Model path must be set when evaluate")
            exit(1)

        # specific log file to save result
        log_file = os.path.join(args.log_dir, args.network_name,
                                str(args.model_num))
        model_dir = os.path.join(args.model_path, args.network_name,
                                 str(args.model_num))

        with tf.Session() as sess:
            # load model
            # with open(model_dir + ".json", 'r') as json_file:
            #     loaded_model_json = json_file.read()
            #     q_network_online = model_from_json(loaded_model_json)
            #     q_network_target = model_from_json(loaded_model_json)
            #
            # sess.run(tf.global_variables_initializer())
            #
            # # load weights into model
            # q_network_online.load_weights(model_dir + ".h5")
            # q_network_target.load_weights(model_dir + ".h5")

            driver_sim = DriverSim()
            env = Environment(driver_sim=driver_sim)

            memory = ReplayMemory(args.buffer_size, args.look_back_steps)
            q_network = create_model(args.look_back_steps, args.map_shape,
                                     args.num_actions)
            dqn_agent = DQNAgent(q_network=q_network,
                                 memory=memory,
                                 policy=policy,
                                 gamma=args.gamma,
                                 target_update_freq=args.target_update_freq,
                                 num_burn_in=args.num_burn_in,
                                 train_freq=args.train_freq,
                                 batch_size=args.batch_size)
        exit(0)
    '''Train the model'''

    with tf.Session() as sess:
        # with tf.device('/cpu:0'):
        print("created model")

        driver_sim = DriverSim()
        env = Environment(driver_sim=driver_sim)
        print("set up environment")

        # # create output dir, meant to pop up error when dir exist to avoid over written
        # os.mkdir(args.output + "/" + args.network_name)

        memory = ReplayMemory(args.buffer_size, args.look_back_steps)
        q_network = create_model(args.look_back_steps, args.map_shape,
                                 args.num_actions)
        dqn_agent = DQNAgent(q_network=q_network,
                             memory=memory,
                             policy=policy,
                             gamma=args.gamma,
                             target_update_freq=args.target_update_freq,
                             num_burn_in=args.num_burn_in,
                             train_freq=args.train_freq,
                             batch_size=args.batch_size)
        print("defined dqn agent")

        optimizer = Adam(learning_rate=args.alpha)
        q_network.compile(optimizer, mean_huber_loss)

        sess.run(tf.global_variables_initializer())

        print("initializing environment")
        env.reset()

        print("in fit")
        if os.path.exists(args.output):
            shutil.rmtree(args.output)
        os.mkdir(args.output)
        dqn_agent.fit(env=env,
                      num_iterations=args.num_iterations,
                      output_dir=os.path.join(args.output),
                      max_episode_length=args.max_episode_length)
Example No. 30
# Shift slowly from exploration to exploitation
eps_decay = 0.0005
# Never move to full exploitation, leave some time for exploration
eps_end = 0.998

# Define some variables to keep track of training progress
# Empty dict for all agents
action_dict = dict()
# Score for all rewards
score = 0

# ------------------------------------------------------
# 4. Load the agent
# ------------------------------------------------------
# Load the agent
agent = DQNAgent(state_size=state_size, action_size=action_size)
# Load the weights (if pretrained agent)
# agent.load("run-003.ckpt")
# agent.q_act.set_weights(agent.q_learn.get_weights())

# ------------------------------------------------------
# 5. Main training loop
# ------------------------------------------------------

for trial in range(1, n_trials + 1):

    # Reset the environment
    obs = env.reset()
    obs = obs[0]
    env_renderer.reset()
Example No. 31
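# Flask-SocketIO demo (Python 2): loads a saved PuckWorld model if present and returns
# the agent's action for each state received over the "act" socket event.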
from flask.ext.socketio import SocketIO, emit

app = Flask(__name__)
app.config["SECRET_KEY"] = "secret!"
socketio = SocketIO(app)

this_dir = os.path.abspath(os.path.dirname(__file__))
deepy_dir = os.path.abspath(this_dir + os.sep + ".." + os.sep + "..")
model_path = this_dir + os.sep + "models" + os.sep + "puckworld_model1.gz"

import sys

sys.path.append(deepy_dir)
from agent import DQNAgent

agent = DQNAgent(8, 5)
if os.path.exists(model_path):
    print "Load model:", model_path
    agent.load(model_path)


@app.route("/")
def index():
    return render_template_string(open(this_dir + os.sep + "test.html").read())


@socketio.on("act", namespace="/test")
def test_action(message):
    action = agent.action(message["state"])
    emit("act", {"action": action})