    return VanillaPolicyGradient("model", discount_factor, learning_rate_policy, learning_rate_advantage,
                                 value_steps_per_update, config, lambda_parameter)


def _define_agent(model: VanillaPolicyGradient) -> VPGRNGDiscreteAgent:
    # Define attributes
    updates_per_training_volley: int = 2
    # Return the agent
    return VPGRNGDiscreteAgent("vpg_agent", model, updates_per_training_volley)


if __name__ == "__main__":
    # Parse the command line arguments
    workspace_path, experiment_iterations_number, cuda_devices, render_during_training, render_during_validation, render_during_test = command_line_parse()
    # Define the CUDA devices on which to run the experiment
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_devices
    # Define the logger
    logger: logging.Logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    # Define the neural network layers
    nn_config: Config = Config()
    nn_config.add_hidden_layer(LayerType.dense,
                               [4096, tensorflow.nn.relu, True, tensorflow.contrib.layers.xavier_initializer()])
    nn_config.add_hidden_layer(LayerType.dense,
                               [4096, tensorflow.nn.relu, True, tensorflow.contrib.layers.xavier_initializer()])
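# --- Illustrative sketch (added commentary, not part of the framework) ---
# The objective a vanilla policy gradient model such as VanillaPolicyGradient
# typically minimizes is the negated expectation E[log pi(a|s) * A(s, a)],
# where the discount factor, lambda parameter and advantage learning rate above
# presumably feed the advantage estimate A(s, a). The helper below is
# hypothetical: it only shows the arithmetic on plain numpy arrays, not the
# real TensorFlow graph the model builds.
import numpy


def _sketch_vpg_loss(log_probabilities: numpy.ndarray, advantages: numpy.ndarray) -> float:
    # Gradient ascent on E[log pi(a|s) * A(s, a)] is implemented as gradient
    # descent on its negation, averaged over the sampled batch
    return float(-(log_probabilities * advantages).mean())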
    exploration_rate_decay: float = 0.00002
    # Return the explorer
    return EpsilonGreedyExplorationPolicy(exploration_rate_max, exploration_rate_min, exploration_rate_decay)


def _define_epsilon_greedy_agent(model: DuelingDeepQLearning,
                                 exploration_policy: EpsilonGreedyExplorationPolicy) -> DDDQLTicTacToeAgent:
    # Define attributes
    weight_copy_step_interval: int = 100
    batch_size: int = 150
    # Return the agent
    return DDDQLTicTacToeAgent("dddqn_egreedy_agent", model, exploration_policy, weight_copy_step_interval, batch_size)


if __name__ == "__main__":
    # Parse the command line arguments
    checkpoint_path, iteration_number, cuda_devices, render = command_line_parse(True)
    # Define the CUDA devices on which to run the experiment
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_devices
    # Define the logger
    logger: logging.Logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    # Tic Tac Toe random environment:
    # - the success threshold at which training is considered complete and the experiment successful is around
    #   95% of matches won by the agent (depending on the reward assigned)
    environment_name: str = 'TicTacToeRandom'
    # Generate the Tic Tac Toe environment with a random environment player, using the O player as the environment
    # player, with the low reward type
    environment_low_reward: TicTacToeEnvironmentRandom = TicTacToeEnvironmentRandom(environment_name, Player.o,
                                                                                    1.0, -0.1, 0.0)
    # Define the neural network layers
    nn_config: Config = Config()
    nn_config.add_hidden_layer(LayerType.dense,
                               [1024, tensorflow.nn.relu, True, tensorflow.contrib.layers.xavier_initializer()])
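# --- Illustrative sketch (added commentary, not part of the framework) ---
# How the three values handed to EpsilonGreedyExplorationPolicy (maximum rate,
# minimum rate, per-step decay) are commonly combined: the exploration rate is
# annealed from the maximum towards the minimum as training steps elapse. The
# linear schedule below is an assumption; the real schedule lives inside the
# framework and is not shown in this excerpt.
def _sketch_exploration_rate(step: int, rate_max: float, rate_min: float, rate_decay: float) -> float:
    # Decrease the rate by rate_decay per step, never dropping below rate_min
    return max(rate_min, rate_max - rate_decay * step)


# Example with hypothetical bounds: after 10000 steps with the decay above,
# _sketch_exploration_rate(10000, 1.0, 0.1, 0.00002) yields 1.0 - 0.2 = 0.8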