# NOTE: numpy is required by evaluate_tank_1_player_machine_with_map() below.
# The Fruit-API classes used throughout (TankBattle, GridWorld, DeepSeaTreasure,
# MountainCar, MilkFactory, FruitEnvironment, AtariProcessor, PolicyNetwork,
# AgentFactory, and the learner/config classes) must be imported from the
# framework; the exact module paths depend on the local Fruit-API checkout,
# so they are not guessed here.
import numpy as np


def train_tank_1_player_machine_lstm():
    # Create a single-player Tank Battle game (no human control)
    game_engine = TankBattle(render=False, player1_human_control=False, player2_human_control=False,
                             two_players=False, speed=1000, frame_skip=5, debug=False)

    # Put the game engine into fruit wrapper
    env = FruitEnvironment(game_engine, max_episode_steps=10000, state_processor=AtariProcessor(),
                           multi_objective=False)

    # A3C LSTM network configuration
    network_config = AtariA3CLSTMConfig(env, initial_learning_rate=0.004)

    # Create a shared policy network
    network = PolicyNetwork(network_config, num_of_checkpoints=40, using_gpu=True)

    # Create an A3C LSTM agent with 8 threads and train it
    agent = A3CLSTMAgent(network, env, num_of_epochs=10, steps_per_epoch=1e6, save_frequency=5e5,
                         update_network_frequency=4,
                         log_dir='./train/nips/TankBattle/a3c_gpu_8_threads_tank_time_based_10_lstm_lr_0004',
                         num_of_threads=8)

    agent.train()

def evaluate_tank_1_player_machine():
    # Render the game at normal speed for evaluation
    game_engine = TankBattle(render=True, player1_human_control=False, player2_human_control=False,
                             two_players=False, speed=60, frame_skip=5, debug=False)

    env = FruitEnvironment(game_engine, max_episode_steps=10000, state_processor=AtariProcessor(),
                           multi_objective=False)

    network_config = AtariA3CConfig(env)

    # Load the pre-trained model
    network = PolicyNetwork(network_config, using_gpu=True,
                            load_model_path='./train/nips/TankBattle/'
                                            'a3c_gpu_8_threads_tank_time_based_10_lr_0004_04-10-2018-16-27/'
                                            'model-9500578')

    agent = A3CAgent(network, env, num_of_epochs=1, steps_per_epoch=100000, report_frequency=1,
                     log_dir='./thi_test/nips/TankBattle/a3c_gpu_8_threads_tank_time_based_30_49_lr_0004',
                     num_of_threads=1)

    agent.evaluate()

def train_mc_grid_world():
    # Create a GridWorld game
    engine = GridWorld(render=False, graphical_state=False, stage=1,
                       number_of_rows=8, number_of_columns=9, speed=1000, seed=100,
                       agent_start_x=2, agent_start_y=2)

    # Put the game engine into fruit wrapper
    environment = FruitEnvironment(game_engine=engine)

    # Create a Monte-Carlo learner (tabular, so no neural network is needed)
    agent = AgentFactory.create(MCLearner, network=None, environment=environment,
                                checkpoint_frequency=1e5, num_of_epochs=1, steps_per_epoch=1e5,
                                learner_report_frequency=10,
                                log_dir='./train/grid_world/mc_checkpoints')

    agent.train()

def train_multi_objective_dqn_agent(is_linear=True, extended_config=True):
    if extended_config:
        # Create a Deep Sea Treasure game with a graphical state
        game = DeepSeaTreasure(graphical_state=True, width=5, seed=100, render=False,
                               max_treasure=100, speed=1000)
        # Put game into fruit wrapper
        environment = FruitEnvironment(game, max_episode_steps=60, state_processor=AtariProcessor())
    else:
        # Create a Deep Sea Treasure game with a low-dimensional state
        game = DeepSeaTreasure(graphical_state=False, width=5, seed=100, render=False,
                               max_treasure=100, speed=1000)
        # Put game into fruit wrapper
        environment = FruitEnvironment(game, max_episode_steps=60)

    # Get treasures
    treasures = game.get_treasure()

    if is_linear:
        tlo_thresholds = None
        linear_thresholds = [1, 0]
    else:
        tlo_thresholds = [(treasures[4] + treasures[3]) / 2]
        linear_thresholds = [10, 1]

    if extended_config:
        config = MOExDQNConfig(environment, is_linear=is_linear, linear_thresholds=linear_thresholds,
                               tlo_thresholds=tlo_thresholds, using_cnn=True, history_length=4)
    else:
        config = MODQNConfig(environment, is_linear=is_linear, linear_thresholds=linear_thresholds,
                             tlo_thresholds=tlo_thresholds)

    # Create a shared policy network
    network = PolicyNetwork(config, max_num_of_checkpoints=10)

    # Create a multi-objective DQN agent
    agent = AgentFactory.create(MODQNLearner, network, environment, num_of_epochs=2,
                                steps_per_epoch=100000, checkpoint_frequency=50000,
                                log_dir='./train/deep_sea_treasure/mo_dqn_checkpoints')

    # Train it
    agent.train()

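# A note on the two scalarization schemes configured above: with is_linear=True
# the multi-objective reward vector is collapsed into one scalar, and the
# parameter name suggests linear_thresholds act as per-objective weights (an
# assumption, not confirmed by this file; with TLO, objectives are instead
# compared against thresholds). A minimal standalone sketch; the helper below
# is hypothetical and not part of Fruit-API:
def _linear_scalarize_sketch(rewards, weights):
    # Weighted sum over objectives, e.g. rewards = [treasure, time_penalty]
    # with weights = [1, 0] optimizes treasure while ignoring time.
    return sum(r * w for r, w in zip(rewards, weights))
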
def train_multi_objective_agent_deep_sea_treasure(env_size):
    # Create a Deep Sea Treasure game
    game = DeepSeaTreasure(width=env_size, seed=100, speed=1000)

    # Put the game engine into fruit wrapper
    environment = FruitEnvironment(game)

    # Create a multi-objective agent using Q-learning
    agent = AgentFactory.create(MOQLearner, None, environment, num_of_epochs=2,
                                steps_per_epoch=100000, checkpoint_frequency=5e4,
                                log_dir='./train/deep_sea_treasure/moq_checkpoints')

    # Train it
    agent.train()

def train_multi_objective_agent_mountain_car():
    # Create a Mountain Car game
    game = MountainCar(graphical_state=False, frame_skip=1, render=False, speed=1000, is_debug=False)

    # Put game into fruit wrapper and enable multi-objective feature
    environment = FruitEnvironment(game)

    # Create a multi-objective agent using the Q-learning algorithm
    agent = AgentFactory.create(MOQLearner, None, environment, num_of_epochs=30,
                                steps_per_epoch=100000, checkpoint_frequency=1e5,
                                log_dir='./train/mountain_car/moq_checkpoints',
                                is_linear=True, thresholds=[0.5, 0.3, 0.2])

    # Train the agent
    agent.train()

def eval_mc_grid_world():
    # Render the game at a slow speed for evaluation
    engine = GridWorld(render=True, graphical_state=False, stage=1,
                       number_of_rows=8, number_of_columns=9, speed=2, seed=100,
                       agent_start_x=2, agent_start_y=2)

    environment = FruitEnvironment(game_engine=engine)

    # Load the trained model and evaluate with a greedy policy (epsilon starts at 0)
    agent = AgentFactory.create(MCLearner, network=None, environment=environment,
                                checkpoint_frequency=1e5, num_of_epochs=1, steps_per_epoch=1e4,
                                learner_report_frequency=50,
                                log_dir='./test/grid_world/mc_checkpoints',
                                load_model_path='./train/grid_world/mc_checkpoints_11-02-2019-02-29/'
                                                'checkpoint_100315.npy',
                                epsilon_annealing_start=0)

    agent.evaluate()

def train_tank_1_player_machine():
    # Create a single-player Tank Battle game (no human control)
    game_engine = TankBattle(render=False, player1_human_control=False, player2_human_control=False,
                             two_players=False, speed=2000, frame_skip=5)

    # Put the game engine into fruit wrapper with a total-reward processor
    env = FruitEnvironment(game_engine, max_episode_steps=10000, state_processor=AtariProcessor(),
                           reward_processor=TankBattleTotalRewardProcessor())

    network_config = AtariA3CConfig(env, initial_learning_rate=0.004)

    network = PolicyNetwork(network_config, max_num_of_checkpoints=20)

    # Create an A3C agent and train it
    agent = AgentFactory.create(A3CLearner, network, env, num_of_epochs=10, steps_per_epoch=1e6,
                                checkpoint_frequency=5e5,
                                log_dir='./train/tank_battle/a3c_checkpoints')

    agent.train()

def train_milk_1_milk_1_fix_robots_with_no_status():
    # Milk Factory with one milk robot, one fix robot, and one milk; status display disabled
    game_engine = MilkFactory(render=False, speed=6000, max_frames=200, frame_skip=1,
                              number_of_milk_robots=1, number_of_fix_robots=1, number_of_milks=1,
                              seed=None, human_control=False, error_freq=0.03,
                              human_control_robot=0, milk_speed=3, debug=False,
                              action_combined_mode=False, show_status=False)

    environment = FruitEnvironment(game_engine, max_episode_steps=200, state_processor=AtariProcessor())

    # Multi-agent A3C configuration
    network_config = MAA3CConfig(environment, initial_learning_rate=0.001, beta=0.001)

    network = PolicyNetwork(network_config, max_num_of_checkpoints=40)

    # Create a multi-agent A3C learner and train it
    agent = AgentFactory.create(MAA3CLearner, network, environment, num_of_epochs=40,
                                steps_per_epoch=1e5, checkpoint_frequency=1e5,
                                log_dir='./train/milk_factory/a3c_ma_2_checkpoints')

    agent.train()

def train_tank_1_player_machine_with_map():
    # Use only the third reward component as the training signal
    def update_reward(rewards):
        return rewards[2]

    game_engine = TankBattle(render=False, player1_human_control=False, player2_human_control=False,
                             two_players=False, speed=1000, frame_skip=5, debug=False,
                             using_map=True, num_of_enemies=5, multi_target=True, strategy=3)

    env = FruitEnvironment(game_engine, max_episode_steps=10000, state_processor=AtariProcessor())

    # Map-based A3C configuration
    network_config = A3CMapConfig(env, initial_learning_rate=0.004)

    network = PolicyNetwork(network_config, max_num_of_checkpoints=20)

    agent = AgentFactory.create(A3CMapLearner, network, env, num_of_epochs=10, steps_per_epoch=1e6,
                                checkpoint_frequency=5e5,
                                log_dir='./train/tank_battle/a3c_map_checkpoints',
                                network_update_steps=4, update_reward_fnc=update_reward)

    agent.train()

def evaluate_tank_1_player_machine_with_map():
    game_engine = TankBattle(render=True, player1_human_control=False, player2_human_control=False,
                             two_players=False, speed=200, frame_skip=5, debug=False,
                             num_of_enemies=5, using_map=True, multi_target=True, strategy=3)

    env = FruitEnvironment(game_engine, max_episode_steps=10000, state_processor=AtariProcessor(),
                           multi_objective=True)

    network_config = NIPSA3CConfig(env)

    # Load the pre-trained model
    network = NIPSPolicyNetwork(network_config, using_gpu=True,
                                load_model_path='./train/nips/TankBattle/'
                                                'a3c_gpu_8_threads_tank_with_map_time_based_10_lr_0004'
                                                '_5_enemy_s3_multi_target_04-18-2018-21-16/model-9500310')

    agent = NIPSA3CAgent(network, env, num_of_epochs=1, steps_per_epoch=10000, report_frequency=1,
                         log_dir='./thi_test/nips/tankbattle/a3c_gpu_8_threads_tank_with_10_lr_0004',
                         num_of_threads=1)

    a3c_reward = agent.evaluate()
    print(a3c_reward)

    # Average the first and third reward components across episodes
    mr_a3c = np.mean([x[0] for x in a3c_reward])
    s_a3c = np.mean([x[2] for x in a3c_reward])
    print("Mean", mr_a3c, s_a3c)
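

# Entry point: a minimal usage sketch, not part of the original samples.
# Any of the functions above can be called here; train_mc_grid_world() is
# chosen only as an illustration.
if __name__ == '__main__':
    train_mc_grid_world()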