Example No. 1
def main():
    args = parse_arguments()

    results_dir = os.path.join('results', args.id)
    os.makedirs(results_dir, exist_ok=True)
    logger = Logger(results_dir)

    metrics = {
        'steps': [],
        'rewards': [],
        'Qs': [],
        'best_avg_reward': -float('inf')
    }
    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        torch.backends.cudnn.enabled = args.enable_cudnn
    else:
        args.device = torch.device('cpu')

    if args.tensorboard_dir is None:
        writer = SummaryWriter(
            os.path.join(results_dir, 'tensorboard', args.game,
                         args.architecture))
    else:
        writer = SummaryWriter(
            os.path.join(args.tensorboard_dir, args.game, args.architecture))

    # Environment
    env = Env(args)
    env.train()
    action_space = env.action_space()

    # Agent
    dqn = Agent(args, env)

    # If a model is provided and evaluate is false, presumably we want to resume, so try to load memory
    if args.model is not None and not args.evaluate:
        if not args.memory:
            raise ValueError(
                'Cannot resume training without memory save path. Aborting...')
        elif not os.path.exists(args.memory):
            raise ValueError(
                'Could not find memory file at {path}. Aborting...'.format(
                    path=args.memory))

        mem = load_memory(args.memory, args.disable_bzip_memory)

    else:
        mem = ReplayMemory(args, args.memory_capacity)

    priority_weight_increase = (1 - args.priority_weight) / (args.T_max -
                                                             args.learn_start)

    # Construct validation memory
    val_mem = ReplayMemory(args, args.evaluation_size)
    T, done = 0, True
    while T < args.evaluation_size:
        if done:
            state, done = env.reset(), False

        next_state, _, done = env.step(np.random.randint(0, action_space))
        val_mem.append(state, None, None, done)
        state = next_state
        T += 1

    if args.evaluate:
        dqn.eval()  # Set DQN (online network) to evaluation mode
        avg_reward, avg_Q = test(args,
                                 0,
                                 dqn,
                                 val_mem,
                                 metrics,
                                 results_dir,
                                 evaluate=True)  # Test
        logger.info('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' +
                    str(avg_Q))
    else:
        # Training loop
        dqn.train()
        T, done = 0, True
        accumulate_reward = 0
        for T in trange(1, args.T_max + 1):
            if done:
                state, done = env.reset(), False
                writer.add_scalar('Train/Reward', accumulate_reward, T)
                accumulate_reward = 0

            if T % args.replay_frequency == 0:
                dqn.reset_noise()  # Draw a new set of noisy weights

            action = dqn.act(
                state)  # Choose an action greedily (with noisy weights)
            next_state, reward, done = env.step(action)  # Step
            accumulate_reward += reward
            if args.reward_clip > 0:
                reward = max(min(reward, args.reward_clip),
                             -args.reward_clip)  # Clip rewards
            mem.append(state, action, reward,
                       done)  # Append transition to memory

            # Train and test
            if T >= args.learn_start:
                mem.priority_weight = min(
                    mem.priority_weight + priority_weight_increase,
                    1)  # Anneal importance sampling weight β to 1

                if T % args.replay_frequency == 0:
                    dqn.learn(
                        mem
                    )  # Train with n-step distributional double-Q learning

                if T % args.evaluation_interval == 0:
                    dqn.eval()  # Set DQN (online network) to evaluation mode
                    avg_reward, avg_Q = test(args, T, dqn, val_mem, metrics,
                                             results_dir)  # Test
                    writer.add_scalar('Eval/Reward', avg_reward, T)
                    writer.add_scalar('Eval/Q', avg_Q, T)
                    logger.info('T = ' + str(T) + ' / ' + str(args.T_max) +
                                ' | Avg. reward: ' + str(avg_reward) +
                                ' | Avg. Q: ' + str(avg_Q))
                    dqn.train(
                    )  # Set DQN (online network) back to training mode

                    # If memory path provided, save it
                    if args.memory is not None:
                        save_memory(mem, args.memory, args.disable_bzip_memory)

                # Update target network
                if T % args.target_update == 0:
                    dqn.update_target_net()

                # Checkpoint the network
                if (args.checkpoint_interval !=
                        0) and (T % args.checkpoint_interval == 0):
                    dqn.save(results_dir, 'checkpoint.pth')

            state = next_state

    env.close()
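
Example No. 1 depends on load_memory and save_memory helpers that are not shown; a minimal sketch, assuming the same bz2/pickle convention as the save_memory defined in Example No. 7:

import bz2
import pickle


def load_memory(memory_path, disable_bzip):
    # Load a replay memory written by save_memory below.
    opener = open if disable_bzip else bz2.open
    with opener(memory_path, 'rb') as f:
        return pickle.load(f)


def save_memory(memory, memory_path, disable_bzip):
    # Persist the replay memory, optionally without bz2 compression.
    opener = open if disable_bzip else bz2.open
    with opener(memory_path, 'wb') as f:
        pickle.dump(memory, f)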
Example No. 2
        action = dqn.act(
            state)  # Choose an action greedily (with noisy weights)
        next_state, reward, done = env.step(action)  # Step
        if args.reward_clip > 0:
            reward = max(min(reward, args.reward_clip),
                         -args.reward_clip)  # Clip rewards
        mem.append(state, action, reward, done)  # Append transition to memory
        T += 1

        if T % args.log_interval == 0:
            log('T = ' + str(T) + ' / ' + str(args.T_max))

        # Train and test
        if T >= args.learn_start:
            mem.priority_weight = min(
                mem.priority_weight + priority_weight_increase,
                1)  # Anneal importance sampling weight β to 1

            if T % args.replay_frequency == 0:
                dqn.learn(
                    mem)  # Train with n-step distributional double-Q learning

            if T % args.evaluation_interval == 0:
                dqn.eval()  # Set DQN (online network) to evaluation mode
                avg_reward, avg_Q = test(args, T, dqn, val_mem)  # Test
                log('T = ' + str(T) + ' / ' + str(args.T_max) +
                    ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' +
                    str(avg_Q))
                dqn.train()  # Set DQN (online network) back to training mode

            # Update target network
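
The β annealing shared by these loops is a linear schedule from the initial priority_weight to 1 over the steps between learn_start and T_max; a standalone sketch of the same arithmetic (argument names mirror the snippets, the function itself is illustrative):

def annealed_priority_weight(t, priority_weight, learn_start, t_max):
    # Linearly anneal the importance-sampling exponent from its initial
    # value to 1.0 over the steps between learn_start and t_max.
    if t < learn_start:
        return priority_weight
    increase = (1 - priority_weight) / (t_max - learn_start)
    return min(priority_weight + (t - learn_start) * increase, 1)


# beta reaches exactly 1.0 at t_max
assert annealed_priority_weight(50_000_000, 0.4, 80_000, 50_000_000) == 1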
Example No. 3
    T, done = 0, True
    for T in tqdm(range(args.T_max)):
        if done:
            state, done = env.reset(), False
        if T % args.replay_frequency == 0:
            dqn.reset_noise()  # Draw a new set of noisy weights

        action = dqn.act(state)  # Choose an action greedily
        next_state, reward, done = env.step(action)  # Step
        if args.reward_clip > 0:
            reward = max(min(reward, args.reward_clip), -args.reward_clip)
        mem.append(state, action, reward, done)  # Append transition to memory

        # Train and test
        if T >= args.learn_start:
            mem.priority_weight = min(
                mem.priority_weight + priority_weight_increase,
                1)  # Anneal importance sampling weight β to 1

            if T % args.replay_frequency == 0:
                dqn.learn(mem)  # Train with n-step distributional double-Q learning

            if T % args.evaluation_interval == 0:
                dqn.eval()  # Set DQN (online network) to evaluation mode
                avg_reward, avg_Q = test(args, T, dqn, val_mem, metrics,
                                         results_dir)  # Test
                log('T = ' + str(T) + ' / ' + str(args.T_max) +
                    ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' +
                    str(avg_Q))
                dqn.train()  # Set DQN (online network) back to training mode

            # Update target network
            if T % args.target_update == 0:
                dqn.update_target_net()
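
Reward clipping appears in every example as a nested max/min; an equivalent helper, shown only to make the convention explicit (a reward_clip of 0 disables clipping):

def clip_reward(reward, reward_clip):
    # Clamp reward to [-reward_clip, reward_clip] when clipping is enabled.
    if reward_clip > 0:
        return max(min(reward, reward_clip), -reward_clip)
    return reward


assert clip_reward(3.5, 1) == 1
assert clip_reward(-3.5, 1) == -1
assert clip_reward(0.5, 0) == 0.5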
Example No. 4
def main():
    args = parser.parse_args()
    print(' ' * 26 + 'Options')
    for k, v in vars(args).items():
        print(' ' * 26 + k + ': ' + str(v))
    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        # Disable nondeterministic ops (not sure if critical but better safe than sorry)
        #torch.backends.cudnn.enabled = False
    else:
        args.device = torch.device('cpu')

    args.large = False
    args.skip_frames = 0
    args.random_aug = 0.

    # Environment
    train_env = create_env(args.environment_filename,
                           custom=True,
                           large=args.large,
                           skip_frames=args.skip_frames,
                           random_aug=args.random_aug,
                           docker=args.docker_training,
                           device=args.device)
    action_space = train_env.action_space

    test_env = create_env(
        args.environment_filename,
        custom=True,
        large=args.large,
        custom_reward=False,
        skip_frames=args.skip_frames,
        docker=args.docker_training,
        device=args.device,
        worker_id=1,
    )

    mem = ReplayMemory(args,
                       args.memory_capacity,
                       obs_space=train_env.observation_space)
    val_mem = ReplayMemory(args,
                           args.evaluation_size,
                           obs_space=test_env.observation_space)

    # for debugging environment issues
    if args.timeout_monitor:
        train_env = TimeoutMonitor(train_env, mem)
        test_env = TimeoutMonitor(test_env, val_mem)

    # Agent
    dqn = Agent(args, train_env)

    priority_weight_increase = (1 - args.priority_weight) / (args.T_max -
                                                             args.learn_start)
    time_step = 0
    done = True
    state = None
    while time_step < args.evaluation_size:
        if done:
            state = train_env.reset()
            done = False

        next_state, _, done, _ = train_env.step(action_space.sample())
        val_mem.append(state, None, None, done)
        state = next_state
        time_step += 1

    if args.evaluate:
        dqn.eval()  # Set DQN (online network) to evaluation mode
        avg_reward, avg_Q = test(args, 0, dqn, val_mem, evaluate=True)  # Test
        print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
    else:
        # Training loop
        dqn.train()
        done = True
        for time_step in tqdm(range(args.T_max)):
            if done:
                state = train_env.reset()
                done = False

            if time_step % args.replay_frequency == 0:
                dqn.reset_noise()  # Draw a new set of noisy weights

            action = dqn.act(
                state)  # Choose an action greedily (with noisy weights)
            next_state, reward, done, info = train_env.step(action)  # Step
            if args.reward_clip > 0:
                reward = max(min(reward, args.reward_clip),
                             -args.reward_clip)  # Clip rewards
            mem.append(state, action, reward,
                       done)  # Append transition to memory

            # Train and test
            if time_step >= args.learn_start:
                # Anneal importance sampling weight β to 1
                mem.priority_weight = min(
                    mem.priority_weight + priority_weight_increase, 1)

                if time_step % args.replay_frequency == 0:
                    dqn.learn(
                        mem
                    )  # Train with n-step distributional double-Q learning

                if time_step % args.evaluation_interval == 0:
                    dqn.eval()  # Set DQN (online network) to evaluation mode
                    avg_reward, avg_Q = test(args,
                                             time_step,
                                             dqn,
                                             val_mem,
                                             env=test_env)  # Test
                    log('T = ' + str(time_step) + ' / ' + str(args.T_max) +
                        ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' +
                        str(avg_Q))
                    dqn.train(
                    )  # Set DQN (online network) back to training mode

                # Update target network
                if time_step % args.target_update == 0:
                    dqn.update_target_net()

            state = next_state

    train_env.close()
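
The validation-memory warm-up in Examples No. 1 and No. 4 follows the same pattern; a hedged refactor into a helper, assuming the three-value step() of Example No. 1 and a ReplayMemory exposing append(state, action, reward, done):

import random


def fill_validation_memory(env, val_mem, evaluation_size, num_actions):
    # Collect evaluation_size transitions under a uniform random policy;
    # actions and rewards are not stored because the validation memory is
    # only sampled for states when estimating average Q-values.
    state, done = None, True
    for _ in range(evaluation_size):
        if done:
            state, done = env.reset(), False
        next_state, _, done = env.step(random.randrange(num_actions))
        val_mem.append(state, None, None, done)
        state = next_state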
Example No. 5
def train_agent(env, args, config):
    """Train an Agent on env with prioritized replay.

    Args:
        env: environment exposing reset()/step().
        args: parsed command-line options (device, memory capacity, etc.).
        config: dict providing at least "seed" and "locexp".
    """

    # Create a CNN that converts the [1, 3, 84, 84] input to [1, 200]
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(config["seed"])
    np.random.seed(config["seed"])
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        torch.backends.cudnn.enabled = args.enable_cudnn
    pathname = dt_string + "_seed" + str(config["seed"])
    print("save tensorboard {}".format(config["locexp"]))
    tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
    agent = Agent(args, env)

    #agent.load(str(args.locexp), "1/checkpoint-52038.pth")
    memory = ReplayMemory(args, args.memory_capacity)
    #memory.load_memory("memory_pacman")
    #memory =  ReplayBuffer((3, config["size"], config["size"]), (1,), config["expert_buffer_size"], int(config["image_pad"]), config["device"])
    priority_weight_increase = (1 - args.priority_weight) / (args.T_max -
                                                             args.learn_start)
    writer = SummaryWriter(tensorboard_name)
    results_dir = os.path.join(str(config["locexp"]), args.id)
    mkdir("", results_dir)
    scores_window = deque(maxlen=100)
    steps_window = deque(maxlen=100)
    scores = []
    t0 = time.time()
    # Training loop
    agent.train()
    T, done = 0, True
    print("result dir ", results_dir)
    agent.save(results_dir, 'checkpoint-{}.pth'.format(T))
    #eval_policy(env, agent, writer, T, config)
    episode = -1
    steps = 0
    score = 0
    print("save policy ", args.checkpoint_interval)
    # eval_policy(env, agent, writer, 0, config)
    for T in range(1, args.T_max + 1):
        # print("\r {} of {}".format(T, args.T_max), end='')
        if done:
            episode += 1
            # Checkpoint the network
            if episode % 100 == 0:
                memory.save_memory("memory_pacman")
                print("Eval policy")
                #eval_policy(env, agent, writer, T, config)
                agent.save(results_dir, 'checkpoint-{}.pth'.format(T))
            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            steps_window.append(steps)
            ave_steps = np.mean(steps_window)
            print(
                '\rTime steps {}  episode {} score {} Average Score: {:.2f} steps {} ave steps {:.2f} time: {}'
                .format(T, episode, score, np.mean(scores_window), steps,
                        ave_steps, time_format(time.time() - t0)),
                end="")
            writer.add_scalar('Episode_reward ', score, T)
            average_reward = np.mean(scores_window)
            writer.add_scalar('Average_reward ', average_reward, T)
            state, done = env.reset("mediumClassic"), False
            steps = 0
            score = 0
        if T % args.replay_frequency == 0:
            agent.reset_noise()  # Draw a new set of noisy weights
        action = agent.act(
            state)  # Choose an action greedily (with noisy weights)
        next_state, reward, done, _ = env.step(action)  # Step
        score += reward
        steps += 1
        if steps == 125:
            done = True
        memory.append(state, action, reward,
                      done)  # Append transition to memory

        # Train and test
        if T >= args.learn_start:
            memory.priority_weight = min(
                memory.priority_weight + priority_weight_increase,
                1)  # Anneal importance sampling weight β to 1

            if T % args.replay_frequency == 0:
                agent.learn(
                    memory
                )  # Train with n-step distributional double-Q learning

            # Update target network
            if T % args.target_update == 0:
                agent.update_target_net()

        state = next_state
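
Example No. 5 tracks a 100-episode moving average with collections.deque; the same idea in isolation, with hypothetical scores:

from collections import deque

import numpy as np

scores_window = deque(maxlen=100)  # keeps only the 100 most recent scores
for episode_score in (10, 12, 8, 15):
    scores_window.append(episode_score)
print('Average Score: {:.2f}'.format(np.mean(scores_window)))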
Example No. 6
def worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    if args.distributed:
        args.seed += args.gpu
        torch.cuda.set_device(args.gpu)

        args.rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
        if args.multiprocessing_distributed:
            args.rank = args.rank * ngpus_per_node + args.gpu

        torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:8632',
                                             world_size=args.world_size, rank=args.rank)
    else:
        args.rank = 0

    args.use_cuda_env = args.use_cuda_env and torch.cuda.is_available()
    args.no_cuda_train = not torch.cuda.is_available()
    args.verbose = args.verbose and (args.rank == 0)

    env_device = torch.device('cuda', args.gpu) if args.use_cuda_env else torch.device('cpu')
    train_device = torch.device('cuda', args.gpu) if not args.no_cuda_train else torch.device('cpu')

    # Setup
    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if args.use_cuda_env or not args.no_cuda_train:
        torch.cuda.manual_seed(random.randint(1, 10000))

    if train_device.type == 'cuda':
        print('Train:\n' + cuda_device_str(train_device.index), flush=True)

    if args.use_openai:
        test_env = create_vectorize_atari_env(args.env_name, args.seed, args.evaluation_episodes,
                                              episode_life=False, clip_rewards=False)
        test_env.reset()
    else:
        test_env = AtariEnv(args.env_name, args.evaluation_episodes, color_mode='gray',
                            device='cpu', rescale=True, clip_rewards=False,
                            episodic_life=False, repeat_prob=0.0, frameskip=4)

    # Agent
    dqn = Agent(args, test_env.action_space)

    # Construct validation memory
    if args.rank == 0:
        print(dqn)
        print('Initializing evaluation memory with {} entries...'.format(args.evaluation_size), end='', flush=True)
        start_time = time.time()

    val_mem = initialize_validation(args, train_device)

    if args.rank == 0:
        print('complete ({})'.format(format_time(time.time() - start_time)), flush=True)

    if args.evaluate:
        if args.rank == 0:
            eval_start_time = time.time()
            dqn.eval()  # Set DQN (online network) to evaluation mode
            rewards, lengths, avg_Q = test(args, 0, dqn, val_mem, test_env, train_device)
            dqn.train()  # Set DQN (online network) back to training mode
            eval_total_time = time.time() - eval_start_time

            rmean, rmedian, rstd, rmin, rmax = vec_stats(rewards)
            lmean, lmedian, lstd, lmin, lmax = vec_stats(lengths)

            print('reward: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                  'length: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                  'Avg. Q: {:4.4f} | {}'
                  .format(rmean, rmin, rmax, rstd, lmean, lmin, lmax,
                          lstd, avg_Q, format_time(eval_total_time)),
                  flush=True)
    else:
        if args.rank == 0:
            print('Entering main training loop', flush=True)

            if args.output_filename:
                csv_file = open(args.output_filename, 'w', newline='')
                csv_file.write(json.dumps(vars(args)))
                csv_file.write('\n')
                csv_writer = csv.writer(csv_file, delimiter=',')
                csv_writer.writerow(['frames', 'total_time',
                                     'rmean', 'rmedian', 'rstd', 'rmin', 'rmax',
                                     'lmean', 'lmedian', 'lstd', 'lmin', 'lmax'])
            else:
                csv_writer, csv_file = None, None

            if args.plot:
                from tensorboardX import SummaryWriter
                current_time = datetime.now().strftime('%b%d_%H-%M-%S')
                log_dir = os.path.join(args.log_dir, current_time + '_' + socket.gethostname())
                writer = SummaryWriter(log_dir=log_dir)
                for k, v in vars(args).items():
                    writer.add_text(k, str(v))

            # Environment
            print('Initializing environments...', end='', flush=True)
            start_time = time.time()

        if args.use_openai:
            train_env = create_vectorize_atari_env(args.env_name, args.seed, args.num_ales,
                                                   episode_life=True, clip_rewards=args.reward_clip,
                                                   max_frames=args.max_episode_length)
            observation = torch.from_numpy(train_env.reset()).squeeze(1)
        else:
            train_env = AtariEnv(args.env_name, args.num_ales, color_mode='gray',
                                 device=env_device, rescale=True,
                                 clip_rewards=args.reward_clip,
                                 episodic_life=True, repeat_prob=0.0)
            train_env.train()
            observation = train_env.reset(initial_steps=args.ale_start_steps, verbose=args.verbose).clone().squeeze(-1)

        if args.rank == 0:
            print('complete ({})'.format(format_time(time.time() - start_time)), flush=True)

        # These variables are used to compute average rewards for all processes.
        episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        has_completed = torch.zeros(args.num_ales, device=train_device, dtype=torch.bool)

        mem = ReplayMemory(args, args.memory_capacity, train_device)
        mem.reset(observation)
        priority_weight_increase = (1 - args.priority_weight) / (args.t_max - args.learn_start)

        state = torch.zeros((args.num_ales, args.history_length, 84, 84), device=mem.device, dtype=torch.float32)
        state[:, -1] = observation.to(device=mem.device, dtype=torch.float32).div(255.0)

        num_frames_per_iter = args.num_ales
        total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))
        epsilons = np.linspace(args.epsilon_start, args.epsilon_final, math.ceil(args.epsilon_frames / num_frames_per_iter))
        epsilon_offset = math.ceil(args.learn_start / num_frames_per_iter)

        prefetcher = data_prefetcher(args.batch_size, train_device, mem)

        avg_loss = 'N/A'
        eval_offset = 0
        target_update_offset = 0

        total_time = 0

        # main loop
        iterator = range(total_steps)
        if args.rank == 0:
            iterator = tqdm(iterator)

        env_stream = torch.cuda.Stream()
        train_stream = torch.cuda.Stream()

        for update in iterator:

            T = args.world_size * update * num_frames_per_iter
            epsilon = epsilons[min(update - epsilon_offset, len(epsilons) - 1)] if T >= args.learn_start else epsilons[0]
            start_time = time.time()

            if update % args.replay_frequency == 0:
                dqn.reset_noise()  # Draw a new set of noisy weights

            dqn.eval()
            nvtx.range_push('train:select action')
            if args.noisy_linear:
                action = dqn.act(state)  # Choose an action greedily (with noisy weights)
            else:
                action = dqn.act_e_greedy(state, epsilon=epsilon)
            nvtx.range_pop()
            dqn.train()

            if args.use_openai:
                action = action.cpu().numpy()

            torch.cuda.synchronize()

            with torch.cuda.stream(env_stream):
                nvtx.range_push('train:env step')
                observation, reward, done, info = train_env.step(action)  # Step

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation).squeeze(1)
                    reward = torch.from_numpy(reward.astype(np.float32))
                    done = torch.from_numpy(done.astype(bool))
                    action = torch.from_numpy(action)
                else:
                    observation = observation.clone().squeeze(-1)
                nvtx.range_pop()

                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device)
                done = done.to(device=train_device, dtype=torch.bool)
                action = action.to(device=train_device)

                observation = observation.float().div_(255.0)
                not_done = 1.0 - done.float()

                state[:, :-1].copy_(state[:, 1:].clone())
                state *= not_done.view(-1, 1, 1, 1)
                state[:, -1].copy_(observation)

                # update episodic reward counters
                has_completed |= done

                episode_rewards += reward.float()
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done

            # Train and test
            if T >= args.learn_start:
                mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)  # Anneal importance sampling weight β to 1
                prefetcher.preload()

                avg_loss = 0.0
                num_minibatches = min(int(args.num_ales / args.replay_frequency), 8)
                for _ in range(num_minibatches):
                    # Sample transitions
                    nvtx.range_push('train:sample states')
                    idxs, states, actions, returns, next_states, nonterminals, weights = prefetcher.next()
                    nvtx.range_pop()

                    nvtx.range_push('train:network update')
                    loss = dqn.learn(states, actions, returns, next_states, nonterminals, weights)
                    nvtx.range_pop()

                    nvtx.range_push('train:update priorities')
                    mem.update_priorities(idxs, loss)  # Update priorities of sampled transitions
                    nvtx.range_pop()

                    avg_loss += loss.mean().item()
                avg_loss /= num_minibatches

                # Update target network
                if T >= target_update_offset:
                    dqn.update_target_net()
                    target_update_offset += args.target_update

            torch.cuda.current_stream().wait_stream(env_stream)
            torch.cuda.current_stream().wait_stream(train_stream)

            nvtx.range_push('train:append memory')
            mem.append(observation, action, reward, done)  # Append transition to memory
            nvtx.range_pop()

            total_time += time.time() - start_time

            if args.rank == 0:
                if args.plot and ((update % args.replay_frequency) == 0):
                    writer.add_scalar('train/epsilon', epsilon, T)
                    writer.add_scalar('train/rewards', final_rewards.mean(), T)
                    writer.add_scalar('train/lengths', final_lengths.mean(), T)

                if T >= eval_offset:
                    eval_start_time = time.time()
                    dqn.eval()  # Set DQN (online network) to evaluation mode
                    rewards, lengths, avg_Q = test(args, T, dqn, val_mem, test_env, train_device)
                    dqn.train()  # Set DQN (online network) back to training mode
                    eval_total_time = time.time() - eval_start_time
                    eval_offset += args.evaluation_interval

                    rmean, rmedian, rstd, rmin, rmax = vec_stats(rewards)
                    lmean, lmedian, lstd, lmin, lmax = vec_stats(lengths)

                    print('reward: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                          'length: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                          'Avg. Q: {:4.4f} | {}'
                          .format(rmean, rmin, rmax, rstd, lmean, lmin, lmax,
                                  lstd, avg_Q, format_time(eval_total_time)),
                          flush=True)

                    if args.output_filename and csv_writer and csv_file:
                        csv_writer.writerow([T, total_time,
                                             rmean, rmedian, rstd, rmin, rmax,
                                             lmean, lmedian, lstd, lmin, lmax])
                        csv_file.flush()

                    if args.plot:
                        writer.add_scalar('eval/rewards', rmean, T)
                        writer.add_scalar('eval/lengths', lmean, T)
                        writer.add_scalar('eval/avg_Q', avg_Q, T)

                loss_str = '{:4.4f}'.format(avg_loss) if isinstance(avg_loss, float) else avg_loss
                progress_data = 'T = {:,} epsilon = {:4.2f} avg reward = {:4.2f} loss: {}' \
                                .format(T, epsilon, final_rewards.mean().item(), loss_str)
                iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        writer.close()

    if args.use_openai:
        train_env.close()
        test_env.close()
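
vec_stats is not defined in Example No. 6; a plausible (hypothetical) implementation matching the five values unpacked at the call sites:

import torch


def vec_stats(values):
    # Return mean, median, std, min and max of per-episode rewards or
    # lengths as Python floats.
    values = torch.as_tensor(values, dtype=torch.float32)
    return (values.mean().item(), values.median().item(),
            values.std().item(), values.min().item(), values.max().item())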
Example No. 7
def train(args, env):
    action_space = env.action_space.n
    print("show action space", action_space)
    print("state space", env.observation_space)
    # Agent
    dqn_1 = Agent(args, env)
    dqn_2 = Agent(args, env)

    results_dir = os.path.join('results', args.id)
    print("result dir", results_dir)

    T, done = 0, True
    # If a model is provided and evaluate is false, presumably we want to resume, so try to load memory
    print("args.continue_training", args.continue_training)

    args.continue_training = False
    if args.continue_training:
        print("Continue Training Load buffer 1 ...")

        args.memory = results_dir + "/val_mem_1/memory.pkl"
        mem_1 = load_memory(args.memory, args.disable_bzip_memory)
        val_mem_1 = ReplayMemory(args, args.evaluation_size)
        print("loaded memory buffer 1")
        print("Continue Training Load buffer 2 ...")
        args.memory = results_dir + "/val_mem_2/memory.pkl"
        mem_2 = load_memory(args.memory, args.disable_bzip_memory)
        val_mem_2 = ReplayMemory(args, args.evaluation_size)
        print("loaded memory buffer 2")

    else:
        print("use empty Buffers")
        args.memory = results_dir + "/val_mem_1/memory.pkl"
        path = results_dir + "/val_mem_1"
        print("save memory", args.memory)
        os.makedirs(path, exist_ok=True)
        val_mem_1 = ReplayMemory(args, args.evaluation_size)
        mem_1 = ReplayMemory(args, args.memory_capacity)
        args.memory = results_dir + "/val_mem_2/memory.pkl"
        path = results_dir + "/val_mem_2"
        print("save memory", args.memory)
        os.makedirs(path, exist_ok=True)
        val_mem_2 = ReplayMemory(args, args.evaluation_size)
        mem_2 = ReplayMemory(args, args.memory_capacity)

    priority_weight_increase = (1 - args.priority_weight) / (args.T_max -
                                                             args.learn_start)
    metrics = {
        'steps': [],
        'rewards': [],
        'Qs': [],
        'step_rewards': [],
        'train_rewards': [],
        'best_avg_reward': -float('inf')
    }

    args.continue_training = True

    def write_into_file(text, file_name='document.csv'):
        """
        """
        with open(file_name, 'a', newline='\n') as fd:
            fd.write(str(text) + "\n")

    def log(s):
        text = '[' + str(
            datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s
        write_into_file(text)
        print(text)

    if torch.cuda.is_available():
        print("cuda")

    def save_memory(memory, memory_path, disable_bzip):
        if disable_bzip:
            with open(memory_path, 'wb') as pickle_file:
                pickle.dump(memory, pickle_file)
        else:
            with bz2.open(memory_path, 'wb') as zipped_pickle_file:
                pickle.dump(memory, zipped_pickle_file)

    ("Create eval memory of size {} ".format(args.evaluation_size))
    # Construct validation memory

    size = 84
    print("Fill eval memory")

    # fill both memories at same time
    # use the reward function for each
    try:
        while T < args.evaluation_size:
            T += 1
            print("steps ", T)
            if done:
                t = 0
                done = False
                state = env.reset()
                state = torch.tensor(state,
                                     dtype=torch.float32,
                                     device=args.device).div_(255)
                zeros = torch.zeros_like(state)
                state_buffer = deque([], maxlen=args.history_length)
                state_buffer.append(zeros)
                state_buffer.append(zeros)
                state_buffer.append(zeros)
                state_buffer.append(state)
                state = torch.stack(list(state_buffer), 0)
            t += 1
            if t == args.max_episode_length:
                #if t == 5:
                t = 0
                done = True
            next_state, _, _, _ = env.step(np.random.randint(0, action_space))

            val_mem_1.append(state, None, None, done)
            val_mem_2.append(state, None, None, done)

            next_state = torch.tensor(next_state,
                                      dtype=torch.float32,
                                      device=args.device).div_(255)
            state_buffer.append(next_state)
            state = torch.stack(list(state_buffer), 0)
        eps_1 = 1
        eps_end_1 = 0.05
        eps_decay_1 = 0.999978  # reaches 10% at 105000

        eps_2 = 1
        eps_end_2 = 0.05
        eps_decay_2 = 0.999978  # reaches 10% at 105000
        #args.evaluate = True
        if args.evaluate:
            print("Test")
            dqn_1.eval()  # Set DQN (online network) to evaluation mode
            #avg_reward, avg_Q = test(args, 0, dqn_1, val_mem_1, metrics, results_dir, env, evaluate=True)  # Test
            avg_reward, avg_Q = test(args, T, dqn_1, val_mem_1, metrics,
                                     results_dir, env)  # Test
            print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' +
                  str(avg_Q))
        else:
            if args.continue_training:
                print("Start Training")
                T = args.learn_start + 500
            # Training loop
            dqn_1.train()
            dqn_2.train()
            episode = 0
            episode_reward = 0
            mean_reward = deque(maxlen=100)
            plot_rewards = []
            print("Fill both memory buffers ")
            while T < args.learn_start:
                if T % args.max_episode_length == 0:
                    state, done = env.reset(), False
                    state = torch.tensor(state,
                                         dtype=torch.float32,
                                         device=args.device).div_(255)
                    zeros = torch.zeros_like(state)
                    state_buffer = deque([], maxlen=args.history_length)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(state)
                    state = torch.stack(list(state_buffer), 0)
                # choose action at random
                action = np.random.randint(0, action_space)
                next_state, reward, done, reward_2 = env.step(action)  # Step
                text = "Step {} of {} ".format(T, args.learn_start)
                print(text, end='\r', file=sys.stdout, flush=True)
                # set done on the last transition
                if (T + 1) % args.max_episode_length == 0:
                    done = True
                mem_1.append(state, action, reward, done)
                mem_2.append(state, action, reward_2, done)
                next_state = torch.tensor(next_state,
                                          dtype=torch.float32,
                                          device=args.device).div_(255)
                state_buffer.append(next_state)
                state = torch.stack(list(state_buffer), 0)
                T += 1
                if T >= args.learn_start:
                    args.memory = results_dir + "/val_mem_1/memory.pkl"
                    print("save memory 1", args.memory)
                    save_memory(mem_1, args.memory, args.disable_bzip_memory)
                    args.memory = results_dir + "/val_mem_2/memory.pkl"
                    print("save memory 2", args.memory)
                    save_memory(mem_2, args.memory, args.disable_bzip_memory)
                    break
            print("Start Training")
            #for T in tqdm.trange(args.learn_start, args.T_max + 1):
            for T in tqdm.trange(0, args.T_max + 1):
                if T % args.max_episode_length == 0:
                    mean_reward.append(episode_reward)
                    print("Epiosde: {}  Reward: {} Mean Reward: {}  Goal1 {}".
                          format(episode, episode_reward, np.mean(mean_reward),
                                 env.goal_counter_1))
                    plot_rewards.append(np.mean(mean_reward))
                    save_and_plot(T, plot_rewards)
                    episode_reward = 0
                    episode += 1
                    state, done = env.reset(), False
                    state = torch.tensor(state,
                                         dtype=torch.float32,
                                         device=args.device).div_(255)
                    zeros = torch.zeros_like(state)
                    state_buffer = deque([], maxlen=args.history_length)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(state)
                    state = torch.stack(list(state_buffer), 0)
                    g = 0
                    set_input = True
                    secondTask = False

                if T % args.replay_frequency == 0:
                    pass
                    #dqn.reset_noise()  # Draw a new set of noisy weights
                """
                if env.task_one_complete or secondTask:
                    action = dqn_2.act_e_greedy(state, eps_2)  # Choose an action greedily (with noisy weights)
                    secondTask = True
                else:
                    action = dqn_1.act_e_greedy(state, eps_1)  # Choose an action greedily (with noisy weights)
                """
                if set_input:
                    set_input = False
                    g = input("Enter action : ")
                    action = int(g)
                    g = input("Enter steps : ")
                    g = int(g)
                if g <= 0:
                    set_input = True
                g -= 1

                #print("step : {} action: {} eps: {}".format(T, action, eps))
                next_state, reward, done, reward_2 = env.step(action)  # Step

                if args.reward_clip > 0:
                    reward = max(min(reward, args.reward_clip),
                                 -args.reward_clip)  # Clip rewards
                    reward_2 = max(min(reward_2, args.reward_clip),
                                   -args.reward_clip)  # Clip rewards

                if env.task_one_complete or secondTask:
                    episode_reward += reward_2
                    eps_2 = max(eps_end_2, eps_decay_2 * eps_2)
                    mem_2.priority_weight = min(
                        mem_2.priority_weight + priority_weight_increase,
                        1)  # Anneal importance sampling weight β to 1
                else:
                    episode_reward += reward
                    eps_1 = max(eps_end_1, eps_decay_1 * eps_1)
                    mem_1.priority_weight = min(
                        mem_1.priority_weight + priority_weight_increase,
                        1)  # Anneal importance sampling weight β to 1

                #print(reward)
                #print(reward_2)
                # in case the last action set done to True
                if (T + 1) % args.max_episode_length == 0:
                    done = True

                mem_1.append(state, action, reward,
                             done)  # Append transition to memory
                mem_2.append(state, action, reward_2,
                             done)  # Append transition to memory

                # Train and test

                next_state = torch.tensor(next_state,
                                          dtype=torch.float32,
                                          device=args.device).div_(255)
                # print("Main shape of  next_state", next_state.shape)
                state_buffer.append(next_state)
                state = torch.stack(list(state_buffer), 0)
                continue
                # print("Main shape of  state", state.shape)
                if T % args.replay_frequency == 0:
                    dqn_1.learn(
                        mem_1
                    )  # Train with n-step distributional double-Q learning
                    dqn_2.learn(
                        mem_2
                    )  # Train with n-step distributional double-Q learning

                if T % args.evaluation_interval == 0:
                    dqn_1.eval()  # Set DQN (online network) to evaluation mode
                    print("Eval epsilon 1 {} epsilon 2 {} ".format(
                        eps_1, eps_2))
                    avg_reward, avg_Q = test(args, T, dqn_1, val_mem_1,
                                             metrics, results_dir, env,
                                             1)  # Test
                    log('T = ' + str(T) + ' / ' + str(args.T_max) +
                        ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' +
                        str(avg_Q))
                    dqn_1.train(
                    )  # Set DQN (online network) back to training mode
                    dqn_2.eval()  # Set DQN (online network) to evaluation mode
                    avg_reward, avg_Q = test(args, T, dqn_2, val_mem_2,
                                             metrics, results_dir, env,
                                             2)  # Test
                    log('T = ' + str(T) + ' / ' + str(args.T_max) +
                        ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' +
                        str(avg_Q))
                    dqn_2.train(
                    )  # Set DQN (online network) back to training mode

                # Update target network
                if T % args.target_update == 0:
                    dqn_1.update_target_net()
                    dqn_2.update_target_net()

                # checkpoint the network
                if (args.checkpoint_interval !=
                        0) and (T % args.checkpoint_interval == 0):
                    #print("save memory", args.memory)
                    #save_memory(mem, args.memory, args.disable_bzip_memory)
                    print("epsilon 1: ", eps_1)
                    print("epsilon 2: ", eps_2)
                    print("Save model at ", results_dir)
                    dqn_1.save(results_dir, '{}-checkpoint.pth'.format(T))
                    dqn_2.save(results_dir, '{}-2-checkpoint.pth'.format(T))
    except KeyboardInterrupt:
        print("Keybaord error")
    finally:
        print("save state....")
        print("Save model at ", results_dir)
        dqn_1.save(results_dir, '{}-checkpoint.pth'.format(T))
        dqn_2.save(results_dir, '{}-2-checkpoint.pth'.format(T))
        args.memory = results_dir + "/val_mem_1/memory.pkl"
        print("save memory 1  ...", args.memory)
        save_memory(mem_1, args.memory, args.disable_bzip_memory)
        args.memory = results_dir + "/val_mem_2/memory.pkl"
        print("save memory 2 ...", args.memory)
        save_memory(mem_2, args.memory, args.disable_bzip_memory)
        print("Save model at ", results_dir)
        dqn_1.save(results_dir, '{}-checkpoint.pth'.format(T))
        dqn_2.save(results_dir, '{}-2-checkpoint.pth'.format(T))
        print("... done Saving State")
        sys.exit()
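
The frame-stacking logic in Example No. 7 is repeated verbatim at every reset; a hedged helper version, assuming frames are already [84, 84] float tensors scaled to [0, 1]:

from collections import deque

import torch


def reset_state_buffer(first_frame, history_length=4):
    # Start an episode with (history_length - 1) blank frames followed by
    # the first observation, mirroring the reset logic in Example No. 7.
    buffer = deque([torch.zeros_like(first_frame)] * (history_length - 1),
                   maxlen=history_length)
    buffer.append(first_frame)
    return buffer, torch.stack(list(buffer), 0)


def push_frame(buffer, frame):
    # Append the newest frame and rebuild the stacked state tensor.
    buffer.append(frame)
    return torch.stack(list(buffer), 0)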