def train(env_id, num_timesteps, num_cpu):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(SEED + rank)
            gym.logger.setLevel(logging.WARN)
            env = wrap_deepmind(env)

            # wrap the env once more to record the total episode reward
            env = Monitor(env, rank)
            return env
        return _thunk

    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    learn(CNN, env, SEED, total_timesteps=int(num_timesteps * 1.1))
    env.close()
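Note: the Monitor wrapper used above is not shown in this example. A minimal sketch of what it might look like, assuming it only accumulates the per-episode reward and reports it through info (the -1 sentinel for total_reward matches how Example #5 reads it); the real wrapper may differ:

import gym


class Monitor(gym.Wrapper):
    # Hypothetical sketch, not the actual wrapper: accumulate the
    # per-episode reward and expose it through `info` so the training
    # loop can read info['reward'] / info['total_reward'] without
    # unwrapping the environment.
    def __init__(self, env, rank):
        super().__init__(env)
        self.rank = rank
        self.episode_reward = 0.0

    def reset(self, **kwargs):
        self.episode_reward = 0.0
        return self.env.reset(**kwargs)

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.episode_reward += reward
        info['reward'] = self.episode_reward
        # -1 is used as an "episode still running" sentinel; the real
        # wrapper presumably reports the unclipped game score here.
        info['total_reward'] = self.episode_reward if done else -1
        return obs, reward, done, info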
Example #3
def main(env_id, num_timesteps, seed, policy, nstack, nsteps, lrschedule,
         optimizer, num_cpu, model_file, use_static_wrapper,
         use_encoded_imagination, use_decoded_imagination):
    num_timesteps //= 4
    assert not (use_encoded_imagination and use_decoded_imagination)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if use_static_wrapper:
                env = StaticWrapper(env)
            if policy == 'cnn' or use_encoded_imagination:
                env = RenderWrapper(env, 400, 600)
                env = DownsampleWrapper(env, 4)
            if use_encoded_imagination or use_decoded_imagination:
                env = FrameStack(env, 3)
            if use_encoded_imagination:
                env = EncodedImaginationWrapper(env, model_file, num_cpu)
            if use_decoded_imagination:
                env = DecodedImaginationWrapper(env, model_file, num_cpu)
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])

    if policy == 'fc':
        policy_fn = FcPolicy
    elif policy == 'cnn':
        policy_fn = CnnPolicy
    else:
        raise ValueError('unknown policy: {}'.format(policy))
    learn(policy_fn,
          env,
          seed,
          nsteps=nsteps,
          nstack=nstack,
          total_timesteps=num_timesteps,
          lrschedule=lrschedule,
          optimizer=optimizer,
          max_episode_length=195)
    env.close()
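RenderWrapper, DownsampleWrapper and the imagination wrappers above are project-specific and not defined in this example. For illustration only, a minimal sketch of what a DownsampleWrapper(env, factor) might do, assuming it simply strides the rendered RGB observation by the given factor (the real wrapper may resize or average instead):

import gym
import numpy as np


class DownsampleWrapper(gym.ObservationWrapper):
    # Hypothetical sketch: shrink an image observation by striding it
    # with `factor` in height and width; channels are kept as-is.
    def __init__(self, env, factor):
        super().__init__(env)
        self.factor = factor
        h, w, c = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(h // factor, w // factor, c),
            dtype=np.uint8)

    def observation(self, obs):
        return obs[::self.factor, ::self.factor, :]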
Example #4
def main():

    cumulative_avg_rewards = []
    for seed_ in [10, 50, 100, 200, 500]:
        seed(seed_)
        set_random_seed(seed_)
        print("Seed: ", seed_)
        episode = 0

        # initialize environment
        env_id = get_args().env
        #env = make_atari(env_id)
        #env = wrap_deepmind(env, frame_stack=True, clip_rewards=False, episode_life=False)
        #env = Monitor(env)

        env = SubprocVecEnv([make_env(seed_, i) for i in range(6)])  #24
        print("CHECK_ENV", env.reset().__array__().shape)
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n
        agent = get_agent(env)
        save_path = os.path.join('models_entropy_coeff1',
                                 "Space_inv_A2C_LSTM_nstep8_MAX_rew_546")
        agent.load(save_path)
        lstm_state = np.zeros((6, 256), dtype=np.float32)  #24

        # run for 100 episodes
        #for i in range(100):
        counter = 0

        episodic_reward_lis = []
        for i in range(wandb.config.episodes):
            # Set reward received in this episode = 0 at the start of the episode
            episodic_reward = np.zeros((6))  #24
            episodic_reward_m = np.zeros((6))  #24

            reset = False

            #env = gym.wrappers.Monitor(env, 'test/'+str(i), force=True)

            obs = env.reset()
            renders = []
            count = 0
            action_count = 0
            done = False
            done1 = np.zeros(6)  #24
            done2 = np.zeros(6)  #24
            while not done:
                a, v, lstm_state = agent.step(obs, S_=lstm_state, M_=done1)
                obs, reward, done1, info = env.step(a, done1, cond="eval")
                done = done2.all()
                if (done):
                    episodic_reward_m1 = episodic_reward_m.max()
                    break
                if (done1.any()):
                    episodic_reward_m[np.logical_and(
                        done2 <= 0, done1)] = episodic_reward[np.logical_and(
                            done2 <= 0, done1)]
                    for j in np.nonzero(done1)[0]:
                        episodic_reward[j] = 0
                episodic_reward += reward
                done2 = np.logical_or(done1, done2)

            if (i == 0):
                reset = True

            cumulative_avg_reward = evaluate(episodic_reward_m1, reset)

        tf.reset_default_graph()
        env.close()

        # your models will be evaluated on 100-episode average reward
        # therefore, we stop logging after 100 episodes
        print("*************************************************************")
        print("CUMULATIVE_AVG_REWARD", cumulative_avg_reward)
        print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
        cumulative_avg_rewards.append(cumulative_avg_reward)

    print("Final score: ", np.mean(cumulative_avg_rewards))
Example #5
def runTrain(gymId='BreakoutNoFrameskip-v4',
             numEnvs=16,
             seed=0,
             filePathBrain='training/breakout-v1.pth',
             numSteps=5,
             numBatches=20000,
             outputBatchInterval=1000,
             joinEnvs=1,
             epsilon=0.00001):
    def make_env(rank):
        def _thunk():
            env = make_atari(gymId)
            env.seed(seed + rank)
            gym.logger.setLevel(logging.WARN)
            env = wrap_deepmind(env)

            # wrap the env once more to record the total episode reward
            env = Monitor(env, rank)
            return env

        return _thunk

    print('training starting', numBatches, outputBatchInterval, 'epsilon',
          epsilon)
    env = SubprocVecEnv([make_env(i) for i in range(numEnvs)])

    numActions = env.action_space.n

    torchDevice = 'cpu'
    if torch.cuda.is_available():
        torchDevice = 'cuda'
    agent = ai_a2c.A2C(numActions, device=torchDevice)
    if filePathBrain:
        agent.load(filePath=filePathBrain)

    timingStart = date_time.now()
    batchCount = 0

    states, actions, rewards, dones, values = [], [], [], [], []
    for ii in range(numEnvs):
        states.append([])
        actions.append([])
        rewards.append([])
        dones.append([])
        values.append([])

    # Set first state.
    # Environment returns 1 frame, but we want multiple, so we stack the new
    # state on top of the past ones.
    nh, nw, nc = env.observation_space.shape
    nstack = 4
    batchStateShape = (numEnvs * numSteps, nh, nw, nc * nstack)
    emptyState = np.zeros((numEnvs, nh, nw, nc * nstack), dtype=np.uint8)
    obs = env.reset()
    # states = updateState(obs, emptyState, nc)
    lastStates = updateState(obs, emptyState, nc)
    lastDones = [False for _ in range(numEnvs)]

    totalRewards = []
    realTotalRewards = []
    # All actions are always valid.
    validActions = [0, 1, 2, 3]

    while batchCount < numBatches:
        states, actions, rewards, dones, values = [], [], [], [], []
        stepCount = 0
        while stepCount < numSteps:
            actionsStep, valuesStep = agent.selectActions(
                lastStates, validActions=validActions, randomRatio=epsilon)
            # print ('actionsStep', actionsStep)
            states.append(np.copy(lastStates))
            actions.append(actionsStep)
            values.append(valuesStep)
            if stepCount > 0:
                dones.append(lastDones)

            # Input the action (run a step) for all environments.
            statesStep, rewardsStep, donesStep, infosStep = env.step(
                actionsStep)

            # Update state for any dones.
            for n, done in enumerate(donesStep):
                if done:
                    lastStates[n] = lastStates[n] * 0
            # Stack the newest observation onto the running frame stack.
            lastStates = updateState(statesStep, lastStates, nc)

            # Update rewards for logging / tracking.
            for done, info in zip(donesStep, infosStep):
                if done:
                    totalRewards.append(info['reward'])
                    if info['total_reward'] != -1:
                        realTotalRewards.append(info['total_reward'])

            lastDones = donesStep
            rewards.append(rewardsStep)

            stepCount += 1

        # Dones is one off, so add the last one.
        dones.append(lastDones)

        # discount/bootstrap off value fn
        # lastValues = self.agent.value(lastStates).tolist()
        # Can skip this as it is done in the learn function with calcActualStateValues?

        # Join all (combine batches and steps).
        states = np.asarray(states, dtype='float32').swapaxes(
            1, 0).reshape(batchStateShape)
        actions = np.asarray(actions).swapaxes(1, 0).flatten()
        rewards = np.asarray(rewards).swapaxes(1, 0).flatten()
        dones = np.asarray(dones).swapaxes(1, 0).flatten()
        values = np.asarray(values).swapaxes(1, 0).flatten()
        agent.learn(states, actions, rewards, dones, values)

        batchCount += 1

        if batchCount % outputBatchInterval == 0:
            runTime = date_time.diff(date_time.now(), timingStart, 'minutes')
            totalSteps = batchCount * numSteps
            runTimePerStep = runTime / totalSteps
            runTimePerStepUnit = 'minutes'
            if runTimePerStep < 0.02:
                runTimePerStep *= 60
                runTimePerStepUnit = 'seconds'
            print(batchCount, numBatches, '(batch done)',
                  number.toFixed(runTime), 'run time minutes,', totalSteps,
                  'steps,', number.toFixed(runTimePerStep), runTimePerStepUnit,
                  'per step')

            r = totalRewards[-100:]  # get last 100
            tr = realTotalRewards[-100:]
            if len(r) == 100:
                print("avg reward (last 100):", np.mean(r))
            if len(tr) == 100:
                print("avg total reward (last 100):", np.mean(tr))
                print("max (last 100):", np.max(tr))

            # Only save periodically as well.
            if filePathBrain:
                agent.save(filePathBrain)

    env.close()

    if filePathBrain:
        agent.save(filePathBrain)

    runTime = date_time.diff(date_time.now(), timingStart, 'minutes')
    totalSteps = numBatches * numSteps
    runTimePerStep = runTime / totalSteps
    runTimePerStepUnit = 'minutes'
    if runTimePerStep < 0.02:
        runTimePerStep *= 60
        runTimePerStepUnit = 'seconds'
    print('training done:', number.toFixed(runTime),
          'run time minutes,', totalSteps, 'steps,',
          number.toFixed(runTimePerStep), runTimePerStepUnit, 'per step')

    return None
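updateState is referenced above but not defined. Following the comment about stacking the newest frame onto the past ones (nstack = 4 frames of nc channels each), a minimal sketch assuming a channel-wise shift, similar to the frame stacking used in common A2C implementations:

import numpy as np


def updateState(obs, state, nc):
    # Hypothetical sketch: shift the stacked frames left by one frame
    # (nc channels) and write the newest observation into the last slot.
    # `state` has shape (numEnvs, nh, nw, nc * nstack); `obs` has shape
    # (numEnvs, nh, nw, nc).
    state = np.roll(state, shift=-nc, axis=3)
    state[:, :, :, -nc:] = obs
    return state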