# We patch the environment to be closer to what Mnih et al. actually do: the environment
# repeats each chosen action 4 times, and during training an episode is considered over
# as soon as a life is lost (not only on full game over).
def _step(a):
    """Replacement for env._step.

    Repeats action index `a` for 4 ALE frames, accumulating the reward.
    Returns the standard (observation, reward, done, info) tuple. In 'train'
    mode, `done` is additionally set when a life was lost during the step.
    """
    # BUG FIX: the original initialized `reward_1, reward_2 = 0.0, 0.0` but then
    # accumulated into an undefined `reward`, raising UnboundLocalError on the
    # first call. Initialize the accumulator that is actually used.
    reward = 0.0
    action = env._action_set[a]
    lives_before = env.ale.lives()
    for _ in range(4):
        reward += env.ale.act(action)
    ob = env._get_obs()
    done = env.ale.game_over() or (args.mode == 'train' and lives_before != env.ale.lives())
    return ob, reward, done, {}
env._step = _step

# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
model_1 = Sequential()
if K.image_dim_ordering() == 'tf':
    # TensorFlow ordering: move channels last -> (width, height, channels)
    model_1.add(Permute((2, 3, 1), input_shape=input_shape))
elif K.image_dim_ordering() == 'th':
    # Theano ordering: input is already (channels, width, height); the identity
    # permute is kept so the input shape is declared on the first layer.
    model_1.add(Permute((1, 2, 3), input_shape=input_shape))
else:
    raise RuntimeError('Unknown image_dim_ordering.')
# Keras 1.x API: Convolution2D(filters, rows, cols, subsample=strides)
model_1.add(Convolution2D(32, 8, 8, subsample=(4, 4)))
model_1.add(Activation('relu'))
model_1.add(Convolution2D(64, 4, 4, subsample=(2, 2)))