        i += 1
    else:
        self.total_score += env.act(2)

import numpy as np

from deer.agent import NeuralAgent
from deer.learning_algos.q_net_keras import MyQNetwork
from deer.learning_algos.AC_net_keras import MyACNetwork
from deer.policies import EpsilonGreedyPolicy

rng = np.random.RandomState(123456)

# TODO: best algorithm, hyperparameter tuning
if args.network == 'DQN':
    network = MyQNetwork(environment=env,
                         batch_size=32,
                         double_Q=True,
                         random_state=rng)
elif args.network == 'DDPG':
    network = MyACNetwork(environment=env,
                          batch_size=32,
                          random_state=rng)

agent = NeuralAgent(env,
                    network,
                    train_policy=EpsilonGreedyPolicy(network, env.nActions(), rng, 0.0),
                    replay_memory_size=1000,
                    batch_size=32,
                    random_state=rng)
# agent.attach(bc.VerboseController())

if args.fname == 'baseline':
    agent = EmpiricalTreatmentAgent(env)
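# As written, the learned (non-baseline) agent is built but never trained.
# A minimal launch sketch using DeeR's standard controllers; the discount
# factor and the (n_epochs, epoch_length) budget below are illustrative
# assumptions, not values from the original script:
import deer.experiment.base_controllers as bc

agent.setDiscountFactor(0.95)              # assumed discount factor
agent.attach(bc.TrainerController())       # train after every action
agent.run(10, 1000)                        # assumed: 10 epochs of 1000 steps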
import sys
import logging

import numpy as np

from deer.default_parser import process_args
from deer.agent import NeuralAgent
from deer.learning_algos.q_net_keras import MyQNetwork
from deer.policies import EpsilonGreedyPolicy
from Toy_env import MyEnv as Toy_env

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # --- Parse parameters ---
    # `Defaults` is the hyperparameter class defined in the full script.
    parameters = process_args(sys.argv[1:], Defaults)
    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    # --- Instantiate environment ---
    env = Toy_env(rng)

    # --- Instantiate qnetwork ---
    qnetwork = MyQNetwork(env,
                          parameters.rms_decay,
                          parameters.rms_epsilon,
                          parameters.momentum,
                          parameters.clip_norm,
                          parameters.freeze_interval,
                          parameters.batch_size,
                          parameters.update_rule,
                          rng)

    train_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.1)
    test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.)

    # --- Instantiate agent ---
    agent = NeuralAgent(env,
                        qnetwork,
                        parameters.replay_memory_size,
                        max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))),
                        parameters.batch_size,
                        rng,
                        train_policy=train_policy,
                        test_policy=test_policy)
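    # What usually follows in DeeR's Toy_env example is a set of controller
    # attachments and the training run itself. A sketch, assuming the standard
    # `import deer.experiment.base_controllers as bc` and assuming `parameters`
    # carries `epochs` and `steps_per_epoch` fields (neither is shown in the
    # excerpt above):
    agent.attach(bc.VerboseController())
    agent.attach(bc.TrainerController())
    agent.attach(bc.InterleavedTestEpochController(
        epoch_length=500,                  # assumed test-epoch length
        controllers_to_disable=[0, 1]))    # skip the two controllers above during tests
    agent.run(parameters.epochs, parameters.steps_per_epoch)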
import numpy as np

from deer.agent import NeuralAgent
from deer.learning_algos.q_net_keras import MyQNetwork
from deer.learning_algos.AC_net_keras import MyACNetwork
import deer.experiment.base_controllers as bc
from deer.policies import EpsilonGreedyPolicy
from misc.other_controllers import GaussianNoiseController
from misc.GaussianNoiseExplorationPolicy import GaussianNoiseExplorationPolicy

env = CellEnvironment(args.obs_type, args.resize, args.reward, args.network, args.special)

rng = np.random.RandomState(777)

# TODO: best algorithm, hyperparameter tuning
if args.network == 'DQN':
    network = MyQNetwork(environment=env,
                         batch_size=32,
                         freeze_interval=args.epochs[1],
                         double_Q=True,
                         random_state=rng)

agent = NeuralAgent(env,
                    network,
                    replay_memory_size=min(int(args.epochs[0] * args.epochs[1] * 1.1), 100000),
                    batch_size=32,
                    random_state=rng)
agent.setDiscountFactor(0.95)

agent.attach(bc.FindBestController(validationID=0, unique_fname=args.fname))
agent.attach(bc.VerboseController())
agent.attach(bc.TrainerController())
agent.attach(
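# The final attach call is cut off in the source. A hypothetical completion,
# kept as a comment because every value in it is a guess: it mirrors the
# validation pattern that FindBestController(validationID=0, ...) implies,
# but the id, epoch length, and disabled-controller indices are not from
# the original script.
#
#     agent.attach(bc.InterleavedTestEpochController(
#         id=0,                          # validation ID matching FindBestController
#         epoch_length=args.epochs[1],   # assumed
#         controllers_to_disable=[1, 2]))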
""" import numpy as np from deer.agent import NeuralAgent from deer.learning_algos.q_net_keras import MyQNetwork from Toy_env import MyEnv as Toy_env import deer.experiment.base_controllers as bc rng = np.random.RandomState(123456) # --- Instantiate environment --- env = Toy_env(rng) # --- Instantiate qnetwork --- qnetwork = MyQNetwork(environment=env, random_state=rng) # --- Instantiate agent --- agent = NeuralAgent(env, qnetwork, random_state=rng) # --- Bind controllers to the agent --- # Before every training epoch, we want to print a summary of the agent's epsilon, discount and # learning rate as well as the training epoch number. agent.attach(bc.VerboseController()) # During training epochs, we want to train the agent after every action it takes. # Plus, we also want to display after each training episode (!= than after every training) the average bellman # residual and the average of the V values obtained during the last episode. agent.attach(bc.TrainerController()) # We also want to interleave a "test epoch" between each training epoch.