Example #1
    def __init__(self):
        self.params = HYPERPARAMS
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.env = gym.make('BeamRider-v4')
        self.env = wrap_dqn(self.env)

        self.policy_net = DQN(self.env.observation_space.shape,
                              self.env.action_space.n).to(self.device)
        if torch.cuda.device_count() > 1:
            print('using %s gpus' % torch.cuda.device_count())
            self.policy_net = nn.DataParallel(self.policy_net)
        self.target_net = copy.deepcopy(self.policy_net)
        self.epsilon_tracker = EpsilonTracker(self.params)
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.params['learning_rate'])
        self.reward_tracker = RewardTracker()
        self.transition = namedtuple(
            'Transition', ('state', 'action', 'reward', 'next_state', 'done'))
        self.memory = ReplayBuffer(self.params['replay_size'])
        # self.memory = Memory(self.params['replay_size'], 0.6)
        self.beta_scheduler = LinearScheduler(
            0.4, 1.0, timespan=self.params['epsilon_frames'])
        self.state = self.preprocess(self.env.reset())
        self.score = 0
        self.batch_size = self.params['batch_size']
        self.tb_writer = SummaryWriter('results')
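Several of these examples call self.preprocess(...) on the observation coming out of the wrap_dqn-wrapped env without showing it. A minimal standalone sketch, assuming the wrappers emit a stack of 84x84 frames (e.g. LazyFrames) and the network expects a batched float tensor on the given device; the function name and dtype handling are assumptions, not the original helper:

import numpy as np
import torch

def preprocess(state, device):
    # sketch only: LazyFrames / ndarray of stacked frames -> (1, C, 84, 84) float tensor
    frames = np.array(state, dtype=np.float32)
    return torch.from_numpy(frames).unsqueeze(0).to(device)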
Example #2
    def playback(self, path):
        target_net = torch.load(path, map_location='cpu')
        env = gym.make('PongNoFrameskip-v4')
        env = wrap_dqn(env)
        state = self.preprocess(env.reset())
        done = False
        score = 0
        import time
        while not done:
            time.sleep(0.015)
            action = torch.argmax(target_net(state), dim=1).to(self.device)
            state, reward, done, _ = env.step(action.item())
            state = self.preprocess(state)
            score += reward
        print("Score: ", score)
Example #3
    def __init__(self):
        self.params = HYPERPARAMS
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.env = gym.make('PongNoFrameskip-v4')
        self.env = wrap_dqn(self.env)

        self.policy_net = DQN(self.env.observation_space.shape, self.env.action_space.n, self.device).to(self.device)
        self.target_net = copy.deepcopy(self.policy_net)
        self.epsilon_tracker = EpsilonTracker(self.params)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.params['learning_rate'])
        self.reward_tracker = RewardTracker()
        self.transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))
        self.memory = ReplayMemory(self.params['replay_size'], self.transition)
        self.episode = 0
        self.state = self.preprocess(self.env.reset())
        self.score = 0
        self.batch_size = self.params['batch_size']
Example #4
    def __init__(self, n_action, is_render=True, is_load=False):
        self.sess = tf.Session()

        self.batch_size = 32

        self.model = DQN(self.sess, n_action, self.batch_size)
        self.model_name = "DQN"

        self.env = wrappers.wrap_dqn(gym.make("BreakoutDeterministic-v4"))
        self.is_render = is_render

        self.EPISODE = 600

        # epsilon parameter
        self.epsilon_s = 1.0
        self.epsilon_e = 0.1
        self.epsilon_decay = 100000
        self.epsilon = self.epsilon_s

        # train parameter
        self.train_start = 5000
        self.update_target_rate = 5000

        self.n_action = n_action
        self.loss = 0

        # info
        self.total_q_max, self.total_loss = 0., 0.

        # save parameter
        self.save_episode_rate = 5

        # load parameter
        self.is_load = is_load
        # saved_model = "./save/{}/{}_episode20.ckpt-{}".format("20180613-132735", self.model_name, "3741")
        self.saved_model = tf.train.latest_checkpoint("./save/20180614-180138")
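Example #4 only defines the epsilon-schedule fields; a minimal sketch of the linear anneal they typically drive (the method name is made up, not from the file):

    def epsilon_by_step(self, step):
        # linear anneal from epsilon_s (1.0) to epsilon_e (0.1) over epsilon_decay steps -- sketch
        frac = min(1.0, step / self.epsilon_decay)
        return self.epsilon_s + frac * (self.epsilon_e - self.epsilon_s)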
Example #5
    parser.add_argument(
        "--reward",
        type=float,
        default=STOP_REWARD,
        help="Mean reward boundary for stop of training, default=%.2f" %
        STOP_REWARD)
    args = parser.parse_args()
    device = torch.device("cuda")

    cp_dir = 'checkpoints/'
    runs_dir = 'runs/'
    os.makedirs(cp_dir, exist_ok=True)
    os.makedirs(runs_dir, exist_ok=True)

    env = gym.make(DEFAULT_ENV_NAME)
    env = wrappers.wrap_dqn(env)

    net = dqn_model.RainbowDQN(env.observation_space.shape,
                               env.action_space.n).to(device)
    tgt_net = dqn_model.RainbowDQN(env.observation_space.shape,
                                   env.action_space.n).to(device)

    date_time = datetime.datetime.now().strftime('%d-%b-%Y_%X_%f')
    run_name = f'{DEFAULT_ENV_NAME}_{date_time}'
    writer = SummaryWriter(runs_dir + run_name)

    quantile_tau = [i / N_QUANTILES for i in range(1, N_QUANTILES + 1)]
    quantile_tau = torch.tensor(quantile_tau).to(device)

    agent = Agent(net, device=device)
    exp_source = experience.ExperienceSourceFirstLast(env,
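The snippet is cut off mid-call, but the quantile_tau tensor is the kind of input a quantile-regression (QR-DQN style) loss consumes. A hedged sketch of such a quantile Huber loss, shown only as an illustration of how tau is usually used, not as this script's own loss function:

import torch

def quantile_huber_loss(pred, target, tau, kappa=1.0):
    # pred, target: (batch, N_QUANTILES); tau: (N_QUANTILES,) -- sketch, not the script's code
    u = target.unsqueeze(1) - pred.unsqueeze(2)                     # pairwise TD errors
    huber = torch.where(u.abs() <= kappa,
                        0.5 * u.pow(2),
                        kappa * (u.abs() - 0.5 * kappa))
    weight = (tau.view(1, -1, 1) - (u.detach() < 0).float()).abs()  # asymmetric quantile weight
    return (weight * huber / kappa).sum(dim=1).mean()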
Example #6
        self.advantage_head = nn.Linear(512, n_actions)
        

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = x.view(x.size(0), -1)
        
        value = self.value_head(F.relu(self.value(x)))
        advantage = self.advantage_head(F.relu(self.advantage(x)))
        
        # dueling aggregation: subtract the per-sample mean over actions
        return value + (advantage - advantage.mean(dim=1, keepdim=True))


env = wrap_dqn(gym.make('PongNoFrameskip-v4'))


q_func = Net(6)
q_func.load_state_dict(torch.load(sys.argv[1]))
q_func.cuda()

def var(x):
    x = np.array(x).reshape(1, 4, 84, 84)
    x = torch.from_numpy(x)
    
    return Variable(x).type(torch.FloatTensor).cuda()

def select_action(x):
    if random.random() < 0.02:
        return env.action_space.sample()
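    # (sketch) greedy branch -- the original function is truncated here; this completion
    # assumes the q_func and var helpers defined above and simply takes the argmax Q-value
    return int(q_func(var(x)).data.max(1)[1].item())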
Example #7
obse = np.resize(state_grey,(84*84))
stacked = np.stack((state_grey,state_grey, state_grey, state_grey), axis = 0)
print(stacked, stacked.shape)
#print(state,state.shape)
#cv2.imshow('image', state_grey)
#cv2.waitKey(0)

'''

'''
output = tf.image.rgb_to_grayscale(state)

output = tf.image.crop_to_bounding_box(output, 34, 0, 160, 160)
output = tf.image.resize_images(output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
output_2 = tf.squeeze(output)
print(output_2.numpy(), output.numpy().shape)

cv2.imshow('image', output.numpy())

cv2.waitKey(0)
'''

env = wp.wrap_dqn(gym.make('BreakoutDeterministic-v4'))
state = env.reset()
done = False
for _ in range(50):
    if not done:
        next_state, reward, done, info = env.step(1)
        print('state', next_state, 'done', done)
        time.sleep(0.1)
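The commented-out block in this example performs grayscale conversion, a 160x160 crop, and an 84x84 nearest-neighbour resize with TensorFlow ops. The same preprocessing in plain OpenCV, as a sketch for comparison (the function name is made up):

import cv2

def to_84x84_gray(frame):
    # frame: raw 210x160x3 RGB Atari observation
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    cropped = gray[34:194, :]                                        # same crop as the TF version
    return cv2.resize(cropped, (84, 84), interpolation=cv2.INTER_NEAREST)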
Example #8
def trainer(MINIBATCH_SIZE=32,
            GAMMA=0.99,
            load=True,
            save=True,
            epsilon=1.0,
            min_epsilon=0.1,
            BUFFER_SIZE=500000,
            train_indicator=True,
            render=True):
    with tf.Session() as sess:

        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        # set environment
        # robot = gym_environment('FrozenLakeNonskid4x4-v3', False, False, False)
        # breakout
        # env = gym.make('BreakoutDeterministic-v4')
        env = wp.wrap_dqn(gym.make('BreakoutDeterministic-v4'))
        # Pong-v0
        # env= wp.wrap_dqn(gym.make('PongDeterministic-v4'))
        agent = Network(sess, SIZE_FRAME, N_ACTIONS, LEARNING_RATE, DEVICE)

        # TensorFlow session init
        sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        agent.update_target_network()
        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
        replay_buffer.load()
        print('buffer size is now', replay_buffer.count)
        # this is for loading the net
        if load:
            try:
                agent.recover()
                print('********************************')
                print('models restored successfully')
                print('********************************')
            except tf.errors.NotFoundError:
                print('********************************')
                print('Failed to restore models')
                print('********************************')

        total_frames_counter = 0
        frames_number = 0
        frames_to_save = 0
        while total_frames_counter < 10000000:

            if frames_to_save > 10000:
                agent.save()
                frames_to_save = 0

            if frames_number > 10000:
                agent.update_target_network()
                frames_number = 0
                print('update_target_network')
                # agent.save()
                # replay_buffer.save()

            state = env.reset()
            q0 = np.zeros(N_ACTIONS)
            ep_reward = 0.
            done = False
            step = 0
            total_loss = deque()
            loss = 0.
            while not done:

                frames_number = frames_number + 1
                frames_to_save = frames_to_save + 1
                total_frames_counter = total_frames_counter + 1
                if total_frames_counter > 20000:
                    epsilon -= 0.00000085
                    epsilon = np.maximum(min_epsilon, epsilon)
                    train_indicator = True
                else:
                    train_indicator = False  #True

                # for visualization
                # numpy_horizontal = np.hstack((np.array(state)[:,:,0], np.array(state)[:,:,1], np.array(state)[:,:,2],np.array(state)[:,:,3]))
                # cv2.imshow('image', numpy_horizontal)
                # cv2.waitKey(1)
                # time.sleep(0.05)

                # 1. get action with e greedy
                if np.random.random_sample() < epsilon:
                    #Explore!
                    action = np.random.randint(0, N_ACTIONS)
                else:
                    # Just stick to what you know bro
                    q0, X = agent.predict(
                        np.reshape(
                            np.array(state).astype(np.uint8),
                            [-1, SIZE_FRAME, SIZE_FRAME, 4]))
                    action = np.argmax(q0)

                next_state, reward, done, info = env.step(
                    action)  #env.step(action)
                # env.render()
                # state = observation

                if train_indicator:
                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:

                        # 4. sample random minibatch of transitions:
                        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                            MINIBATCH_SIZE)

                        q_eval = agent.predict_target(s2_batch)
                        q_target = np.zeros(MINIBATCH_SIZE)

                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                q_target[k] = r_batch[k]
                            else:
                                q_target[k] = r_batch[k] + GAMMA * np.max(
                                    q_eval[k])

                        #5.3 Train agent!
                        loss, _ = agent.train(
                            np.reshape(a_batch, (MINIBATCH_SIZE, 1)),
                            np.reshape(q_target, (MINIBATCH_SIZE, 1)), s_batch)
                        # in case you want to understand the inner workings of this
                        # target_final, q_acted, delta, loss, optimize = agent.train_v2(np.reshape(a_batch,(MINIBATCH_SIZE,1)),np.reshape(q_target,(MINIBATCH_SIZE,1)), s_batch )
                        # print('target_final', target_final, 'q_acted', q_acted, 'delta', delta, 'loss', loss)

                # 3. Save in replay buffer:
                replay_buffer.add(state, action, reward, done, next_state)
                state = next_state
                ep_reward = ep_reward + reward
                step += 1
                total_loss.append(loss)

            print('th', total_frames_counter + 1, 'Step', step, 'Reward:',
                  ep_reward, 'epsilon', round(epsilon, 3), np.mean(total_loss))

            # print('the reward at the end of the episode,', reward)

        print('*************************')
        print('now we save the model')
        agent.save()
        #replay_buffer.save()
        print('model saved successfully')
        print('*************************')
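The per-sample TD-target loop in Example #8 (the for k in range(MINIBATCH_SIZE) block) can be collapsed into one vectorized NumPy expression; a sketch, assuming t_batch is a boolean or 0/1 done mask:

q_target = r_batch + GAMMA * np.max(q_eval, axis=1) * (1.0 - np.asarray(t_batch, dtype=np.float32))

The reshape to (MINIBATCH_SIZE, 1) before agent.train stays the same as in the original loop.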