def launch_after_training(params, net_state_dict, device, episodes, opt_steps):
    env = Assault(23)
    net = DQN(env.state_sz, env.action_sz, "vae",
              params["image_input"] == "True", device=device).to(device)
    net.load_state_dict(net_state_dict)
    controller = FixedController(
        lambda state, explore: net(state.to(device)).max(1)[1].item())
    agent = Agent(env, controller)

    plot_name = "AfterTraining"
    log().add_plot(plot_name, columns=("train_episode", "train_steps", "reward"))

    pbar = tqdm(range(episodes))
    total_steps = 0
    for episode in pbar:
        pbar.set_description("Episode [{}/{}] Step[{}/{}] Exploit".format(
            episode + 1, episodes, total_steps, opt_steps))
        reward, steps = agent.rollout(train=False)
        total_steps += steps
        log().add_plot_point(plot_name, (episode, total_steps, reward))
        if total_steps >= opt_steps:
            break

    log().save_logs()
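# Hypothetical usage sketch (not from the original source): evaluating a saved
# DQN checkpoint with launch_after_training, assuming `params` is available and
# "model.pt" is a placeholder path for a previously saved state_dict.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
launch_after_training(params,
                      net_state_dict=torch.load("model.pt", map_location=device),
                      device=device,
                      episodes=100,
                      opt_steps=100000)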
def __init__(self, id, env):
    Agent.__init__(self, id, env)
    self._energy = Parameters.initEnergy
    self._reproductionMinEnergy = Parameters.reproductionMinEnergy
    self._genotype = []
    self._rand = Random()
    getattr(self, "_initialization" + Parameters.initialization)()
    self._fitness = None
    self._updated = None
    self._recalculateFitness()
    self._fitnessCalls = 0
def __init__(self, id, env):
    Agent.__init__(self, id, env)
    self._energy = Parameters.initEnergy
    self._reproductionMinEnergy = Parameters.reproductionMinEnergy
    self._genotype = []
    self._rand = Random()
    rand = self._rand
    for i in xrange(Parameters.genotypeLength):
        if rand.randint(0, 100) > 0:
            self._genotype.append(Parameters.cubeSize * rand.random())
        else:
            self._genotype.append(-1 * Parameters.cubeSize * rand.random())
def train(experiment):
    env = experiment.env(random_state=experiment.random_seed)
    memory = ReplayMemory(experiment.hyperparams.memory_config.memory_size)
    controller = ControllerDQN(env=env,
                               memory=memory,
                               params=experiment.hyperparams,
                               prune_percent=experiment.prune_percent,
                               pruner=experiment.pruner,
                               stop_criterion=experiment.stop_criterion,
                               device=experiment.device)
    agent = Agent(env, controller)

    EXPLORE_ITERS = 1
    EXPLOIT_ITERS = 1
    episodes, prune_iters, opt_steps = experiment.episodes, experiment.prune_iters, experiment.opt_steps

    for iter in range(prune_iters):
        pbar = tqdm(range(episodes))
        cur_percent = (1 - experiment.prune_percent / 100)**iter
        explore_plot = "Explore_iter" + str(iter) + "_prune" + str(cur_percent)
        exploit_plot = "Exploit_iter" + str(iter) + "_prune" + str(cur_percent)
        log().add_plot(explore_plot, columns=("train_episode", "train_steps", "reward"))
        log().add_plot(exploit_plot, columns=("train_episode", "train_steps", "reward"))

        for episode in pbar:
            # once in EXPLORE_ITERS train rollouts, do EXPLOIT_ITERS exploit rollouts
            if episode % EXPLORE_ITERS == EXPLORE_ITERS - 1:
                for _ in range(EXPLOIT_ITERS):
                    pbar.set_description(
                        "Iter[{}/{}] Episode [{}/{}] Step[{}/{}] Exploit".format(
                            iter + 1, prune_iters, episode + 1, episodes,
                            controller.steps_done, opt_steps))
                    exploit(agent, episode, exploit_plot)

            pbar.set_description(
                "Iter[{}/{}] Episode [{}/{}] Step[{}/{}] Explore".format(
                    iter + 1, prune_iters, episode + 1, episodes,
                    controller.steps_done, opt_steps))
            explore(agent, episode, explore_plot)

            if controller.steps_done >= opt_steps:
                break
            if controller.optimization_completed() and not iter + 1 == prune_iters:
                # no stop on last iteration
                break

        torch.cuda.empty_cache()
        log().save_logs()
        log().save_model(controller.get_state(),
                         "model:iter{}:{}".format(iter, cur_percent))
        controller.prune()
        controller.reinit()
agent = Agent(name="agent_" + str(np.random.randint(low=1000000, high=9999999)), actionScaling=1.0, policyNetworkSize=[256, 256], qNetworkSize=[256, 256], numQuantiles=16, policyNetworkLearningRate=3e-4, qNetworkLearningRate=3e-4, entropyCoefficient="auto", tau=0.005, gamma=0.99, kappa=1.0, maxMemoryLength=int(1e5), priorityExponent=0.0, batchSize=64, nStep=3, frameSkip=2, maxEpisodes=4096, trainSteps=1024, maxTrainSteps=6000000, minStepsBeforeTraining=10000, rewardScaling=(10.0**-0.75), actionShift=0.0, stepsPerUpdate=1, render=False, showGraphs=True, saveModel=True, saveModelToS3=False, restoreModel=False, train=True, testSteps=1024, maxMinutes=360, targetEntropy=-4.0, maxGradientNorm=5.0, meanRegularizationConstant=0.0, varianceRegularizationConstant=0.0, randomStartSteps=10000, gradientSteps=1, initialExtraNoise=0, extraNoiseDecay=0, evaluationEvery=25, numFinalEvaluations=10)
def optimize_joint(system_nn, pol_nn, log_writer, **kwargs):
    # unpack kwargs
    horizon = kwargs.get("horizon")
    nb_iterations = kwargs.get("nb_iterations")
    batch_size = kwargs.get("batch_size")
    policy_fit = kwargs.get("policy", False)
    system_fit = kwargs.get("system", False)
    mc_samples = kwargs.get("mc_samples", 128)

    env = Environment(system_nn)
    agent = Agent(pol_nn, env, horizon)

    # Optimizers
    parameters_list = []
    if policy_fit:
        parameters_list = parameters_list + list(pol_nn.parameters())
    if system_fit:
        parameters_list = parameters_list + list(system_nn.parameters())
    if parameters_list:
        lr = kwargs.get("learning_rate", .001)
        optimizer = Adam(parameters_list, lr=lr)

    for it in range(nb_iterations):
        loss = {}
        params = {}

        # set gradient to zero
        optimizer.zero_grad()

        # generate the batch
        _, states_batch, dist_batch, _, oha_batch, rew_batch = agent.sample_trajectory(
            batch_size)

        # Loss #
        system_loss = system_error(system_nn, pol_nn, states_batch, dist_batch,
                                   oha_batch, rew_batch)
        system_loss.backward(retain_graph=policy_fit)
        optimizer.step()
        system_nn.project_parameters()
        pol_nn.project_parameters()

        if system_fit and log_writer is not None:
            params['system'] = system_nn.unwrapped.named_parameters()
            log_writer.add_system_parameters(system_nn.parameters_dict(), step=it)
        if policy_fit and log_writer is not None:
            params['policy'] = pol_nn.named_parameters()
            actions = pol_nn(states_batch)  # (B, H, A), need to stack along the B dim
            log_writer.add_policy_histograms(actions.view(-1, actions.shape[2]),
                                             step=it)
        if log_writer is not None:
            loss['loss'] = system_loss.item()
            log_writer.add_grad_histograms(params, step=it)
            log_writer.add_loss(loss, step=it)

            # performance of the agent on the epoch
            ep_perf, return_estimate = agent.avg_performance(mc_samples)
            log_writer.add_expected_return(ep_perf, step=it)
            log_writer.add_return(return_estimate, step=it)

    return env, agent
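# Hypothetical usage sketch (not from the original source): optimize_joint reads
# its settings from **kwargs, so a call could look like this, assuming system_nn
# and pol_nn are already-constructed networks exposing project_parameters() and
# that passing log_writer=None simply disables logging.
env, agent = optimize_joint(system_nn,
                            pol_nn,
                            log_writer=None,
                            horizon=20,            # rollout length given to Agent
                            nb_iterations=500,     # optimization iterations
                            batch_size=32,         # trajectories sampled per iteration
                            policy=True,           # fit the policy parameters
                            system=True,           # fit the system parameters
                            mc_samples=128,        # samples for avg_performance
                            learning_rate=1e-3)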
import numpy as np  # used below for reward averaging
import torch
import torch.optim as optim

from agent.Agent import Agent
from environment.Environment import Environment
from policy.Policy import Policy
from utils.utils import plot_training_evolution

learning_rate = 0.01
discount_factor = 0.99
episodes = 5000

env = Environment('LunarLander-v2')
policy: Policy = Policy(env.observation_space(), env.action_space())
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
agent = Agent(env, policy, optimizer)

for episode in range(episodes):
    agent.run_episode()
    agent.update_policy(discount_factor=discount_factor)

    if episode % 50 == 0:
        print('Episode {}\tAverage reward: {}'.format(
            episode, np.array(policy.reward_history[-50:]).mean()))

    if env.is_solved(np.array(policy.reward_history[-50:]).mean()):
        break

torch.save(policy.state_dict(), 'saved_policy/policy.pt')
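# Hypothetical follow-up (not from the original source): reloading the saved
# weights for evaluation, assuming Policy is a torch.nn.Module built with the
# same constructor arguments as above.
eval_policy = Policy(env.observation_space(), env.action_space())
eval_policy.load_state_dict(torch.load('saved_policy/policy.pt'))
eval_policy.eval()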
def __init__(self, env, childrenEnv=None):
    addr = AddressManager.getAddress(None)
    Agent.__init__(self, addr, env, childrenEnv)
    self._reprCount = 0
    self._rand = Random()
agent = Agent(name="agent_2943367", actionScaling=1.0, policyNetworkSize=[256, 256], qNetworkSize=[256, 256], policyNetworkLearningRate=3e-4, qNetworkLearningRate=3e-4, entropyCoefficient="auto", tau=0.005, gamma=0.99, maxMemoryLength=int(5e6), priorityExponent=0.0, batchSize=256, maxEpisodes=0, trainSteps=1024, minStepsBeforeTraining=4096, rewardScaling=rewardScaling, actionShift=0.0, stepsPerUpdate=1, render=True, showGraphs=False, saveModel=False, restoreModel=True, train=False, testSteps=1024, maxMinutes=60, targetEntropy=-4.0, maxGradientNorm=5.0, meanRegularizationConstant=0.0, varianceRegularizationConstant=0.0, randomStartSteps=0, gradientSteps=1, initialExtraNoise=0, extraNoiseDecay=0, evaluationEvery=250, numFinalEvaluations=10)
result = -20000
try:
    agent = Agent(name="agent_" + str(np.random.randint(low=1000000, high=9999999)),
                  policyNetworkSize=[256, 256],
                  qNetworkSize=[256, 256],
                  valueNetworkSize=[256, 256],
                  entropyCoefficient=entropyCoefficient,
                  valueNetworkLearningRate=learningRate,
                  policyNetworkLearningRate=learningRate,
                  qNetworkLearningRate=learningRate,
                  tau=0.005,
                  gamma=0.99,
                  maxMemoryLength=int(1e6),
                  priorityExponent=0,
                  batchSize=256,
                  maxGradientNorm=5,
                  maxEpisodes=1024,
                  trainSteps=1024,
                  minStepsBeforeTraining=4096,
                  rewardScaling=rewardScaling,
                  actionScaling=actionScaling,
                  actionShift=0.0,
                  stepsPerUpdate=1,
                  render=True,
                  showGraphs=True,
                  meanRegularizationConstant=weightRegularizationConstant,
                  varianceRegularizationConstant=weightRegularizationConstant,
                  testSteps=1024,
                  maxMinutes=600)

    result = agent.execute()
def __init__(self, name, owner):
    self.agent = Agent(name, owner)
    self.name = name
    self.owner = owner
parser.add_argument('--mode',
                    type=str,
                    help='Mode',
                    default="train",
                    choices=["train", "infer"])
args = parser.parse_args()

ray.init()

config = {}
with open(args.agent_config_path) as config_file:
    config.update(json.load(config_file))
config["explore"] = True if args.mode == "train" else False
with open(args.environment_config_path) as config_file:
    config.update(json.load(config_file))

env = Environment(config)
agent = Agent(config)

if args.mode == "train":
    config.update({
        "num_gpus": 0,
        "num_workers": 1,
        "monitor": False
    })
    tune.run(
        agent.__class__,
        name=env.__class__.__name__ + "_" + agent.__class__.__name__,
        stop={"episode_reward_mean": -100},
import queue
import threading  # used below to start the serial-reading thread
import time
import datetime

from data.DataSettings import DataSettings
from data_streamer.DataStreamer import DataStreamer
from data_logger.DataLogger import DataLogger
from agent.Agent import Agent
from sensors.PowerButton import PowerButton
from sensors.DataButton import DataButton
from sensors.RelaySensor import RelaySensor

data_settings = DataSettings()
data_streamer = DataStreamer()
data_logger = DataLogger(data_streamer)
relay = RelaySensor()
agent = Agent(data_settings, relay)

power_button_pin, data_button_pin = 18, 19  # Grove Base Hat connector D18
power_button = PowerButton(power_button_pin)
data_button = DataButton(data_button_pin)

command_queue = queue.Queue()


def setup():
    data_settings.load_settings()
    agent.update()
    t1 = threading.Thread(target=read_incoming_serial)
    t1.start()


def read_incoming_serial():
    previous_serial_data = ""