Example #1
def launch_after_training(params, net_state_dict, device, episodes, opt_steps):
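    # rebuild the trained DQN from its state_dict and run greedy (argmax-Q) evaluation rollouts
    # until either opt_steps environment steps or `episodes` rollouts have been completed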
    env = Assault(23)
    net = DQN(env.state_sz,
              env.action_sz,
              "vae",
              params["image_input"] == "True",
              device=device).to(device)
    net.load_state_dict(net_state_dict)
    controller = FixedController(
        lambda state, explore: net(state.to(device)).max(1)[1].item())
    agent = Agent(env, controller)

    plot_name = "AfterTraining"
    log().add_plot(plot_name,
                   columns=("train_episode", "train_steps", "reward"))
    pbar = tqdm(range(episodes))
    total_steps = 0
    for episode in pbar:
        pbar.set_description("Episode [{}/{}] Step[{}/{}] Exploit".format(
            episode + 1, episodes, total_steps, opt_steps))

        reward, steps = agent.rollout(train=False)
        total_steps += steps
        log().add_plot_point(plot_name, (episode, total_steps, reward))

        if total_steps >= opt_steps:
            break

    log().save_logs()
Example #2
    def __init__(self, id, env):
        Agent.__init__(self, id, env)
        self._energy = Parameters.initEnergy
        self._reproductionMinEnergy = Parameters.reproductionMinEnergy
        self._genotype = []
        self._rand = Random()
        # dispatch to the genotype initializer selected by Parameters.initialization
        getattr(self, "_initialization" + Parameters.initialization)()
        self._fitness = None
        self._updated = None
        self._recalculateFitness()
        self._fitnessCalls = 0
Example #3
    def __init__(self, id, env):
        Agent.__init__(self, id, env)
        self._energy = Parameters.initEnergy
        self._reproductionMinEnergy = Parameters.reproductionMinEnergy
        self._genotype = []
        self._rand = Random()
        rand = self._rand
        # fill the genotype with random genes in [0, cubeSize);
        # a gene is negated only when randint(0, 100) returns 0
        for _ in range(Parameters.genotypeLength):
            if rand.randint(0, 100) > 0:
                self._genotype.append(Parameters.cubeSize * rand.random())
            else:
                self._genotype.append(-1 * Parameters.cubeSize * rand.random())
Example #4
def train(experiment):
    env = experiment.env(random_state=experiment.random_seed)
    memory = ReplayMemory(experiment.hyperparams.memory_config.memory_size)
    controller = ControllerDQN(env=env,
                               memory=memory,
                               params=experiment.hyperparams,
                               prune_percent=experiment.prune_percent,
                               pruner=experiment.pruner,
                               stop_criterion=experiment.stop_criterion,
                               device=experiment.device)
    agent = Agent(env, controller)

    EXPLORE_ITERS = 1
    EXPLOIT_ITERS = 1

    episodes, prune_iters, opt_steps = experiment.episodes, experiment.prune_iters, experiment.opt_steps

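    # pruning loop: each iteration trains the agent, saves logs and a model checkpoint,
    # then prunes the controller's network and re-initializes it for the next iteration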
    for iter in range(prune_iters):
        pbar = tqdm(range(episodes))
        cur_percent = (1 - experiment.prune_percent / 100)**iter
        explore_plot = "Explore_iter" + str(iter) + "_prune" + str(cur_percent)
        exploit_plot = "Exploit_iter" + str(iter) + "_prune" + str(cur_percent)
        log().add_plot(explore_plot,
                       columns=("train_episode", "train_steps", "reward"))
        log().add_plot(exploit_plot,
                       columns=("train_episode", "train_steps", "reward"))

        for episode in pbar:
            # once in EXPLORE_ITERS train rollouts, do EXPLOIT_ITERS exploit rollouts
            if episode % EXPLORE_ITERS == EXPLORE_ITERS - 1:
                for _ in range(EXPLOIT_ITERS):
                    pbar.set_description(
                        "Iter[{}/{}] Episode [{}/{}] Step[{}/{}] Exploit".format(
                            iter + 1, prune_iters, episode + 1, episodes,
                            controller.steps_done, opt_steps))
                    exploit(agent, episode, exploit_plot)

            pbar.set_description(
                "Iter[{}/{}] Episode [{}/{}] Step[{}/{}] Explore".format(
                    iter + 1, prune_iters, episode + 1, episodes,
                    controller.steps_done, opt_steps))
            explore(agent, episode, explore_plot)

            if controller.steps_done >= opt_steps:
                break
            # do not stop early on the last pruning iteration
            if controller.optimization_completed() and iter + 1 != prune_iters:
                break

            torch.cuda.empty_cache()

        log().save_logs()
        log().save_model(controller.get_state(),
                         "model:iter{}:{}".format(iter, cur_percent))

        controller.prune()
        controller.reinit()
Example #5
agent = Agent(name="agent_" +
              str(np.random.randint(low=1000000, high=9999999)),
              actionScaling=1.0,
              policyNetworkSize=[256, 256],
              qNetworkSize=[256, 256],
              numQuantiles=16,
              policyNetworkLearningRate=3e-4,
              qNetworkLearningRate=3e-4,
              entropyCoefficient="auto",
              tau=0.005,
              gamma=0.99,
              kappa=1.0,
              maxMemoryLength=int(1e5),
              priorityExponent=0.0,
              batchSize=64,
              nStep=3,
              frameSkip=2,
              maxEpisodes=4096,
              trainSteps=1024,
              maxTrainSteps=6000000,
              minStepsBeforeTraining=10000,
              rewardScaling=(10.0**-0.75),
              actionShift=0.0,
              stepsPerUpdate=1,
              render=False,
              showGraphs=True,
              saveModel=True,
              saveModelToS3=False,
              restoreModel=False,
              train=True,
              testSteps=1024,
              maxMinutes=360,
              targetEntropy=-4.0,
              maxGradientNorm=5.0,
              meanRegularizationConstant=0.0,
              varianceRegularizationConstant=0.0,
              randomStartSteps=10000,
              gradientSteps=1,
              initialExtraNoise=0,
              extraNoiseDecay=0,
              evaluationEvery=25,
              numFinalEvaluations=10)
Example #6
def optimize_joint(system_nn, pol_nn, log_writer, **kwargs):
    # unpack kwargs
    horizon = kwargs.get("horizon")
    nb_iterations = kwargs.get("nb_iterations")
    batch_size = kwargs.get("batch_size")

    policy_fit = kwargs.get("policy", False)
    system_fit = kwargs.get("system", False)

    mc_samples = kwargs.get("mc_samples", 128)

    env = Environment(system_nn)
    agent = Agent(pol_nn, env, horizon)

    # Optimizers
    parameters_list = []
    if policy_fit:
        parameters_list = parameters_list + list(pol_nn.parameters())

    if system_fit:
        parameters_list = parameters_list + list(system_nn.parameters())
    if parameters_list:
        lr = kwargs.get("learning_rate", .001)
        optimizer = Adam(parameters_list, lr=lr)

        for it in range(nb_iterations):
            loss = {}
            params = {}

            # set gradient to zero
            optimizer.zero_grad()

            # generate the batch
            _, states_batch, dist_batch, _, oha_batch, rew_batch = agent.sample_trajectory(
                batch_size)

            # Loss #
            system_loss = system_error(system_nn, pol_nn, states_batch,
                                       dist_batch, oha_batch, rew_batch)

            system_loss.backward(retain_graph=policy_fit)

            optimizer.step()
            system_nn.project_parameters()
            pol_nn.project_parameters()

            if system_fit and log_writer is not None:
                params['system'] = system_nn.unwrapped.named_parameters()
                log_writer.add_system_parameters(system_nn.parameters_dict(),
                                                 step=it)

            if policy_fit and log_writer is not None:
                params['policy'] = pol_nn.named_parameters()
                actions = pol_nn(states_batch)  # (B, H, A), need to stack along the B dim
                log_writer.add_policy_histograms(
                    actions.view(-1, actions.shape[2]), step=it)

            if log_writer is not None:
                loss['loss'] = system_loss.item()

                log_writer.add_grad_histograms(params, step=it)
                log_writer.add_loss(loss, step=it)

                # performance of the agent on the epoch
                ep_perf, return_estimate = agent.avg_performance(mc_samples)
                log_writer.add_expected_return(ep_perf, step=it)
                log_writer.add_return(return_estimate, step=it)

    return env, agent
Example #7
import numpy as np
import torch
import torch.optim as optim

from agent.Agent import Agent
from environment.Environment import Environment
from policy.Policy import Policy
from utils.utils import plot_training_evolution

learning_rate = 0.01
discount_factor = 0.99
episodes = 5000

env = Environment('LunarLander-v2')
policy: Policy = Policy(env.observation_space(), env.action_space())
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
agent = Agent(env, policy, optimizer)

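# train episode by episode; every 50 episodes print the running average reward
# and stop early once the environment reports the task as solved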
for episode in range(episodes):

    agent.run_episode()
    agent.update_policy(discount_factor=discount_factor)

    if episode % 50 == 0:
        print('Episode {}\tAverage reward: {}'.format(
            episode,
            np.array(policy.reward_history[-50:]).mean()))

        if env.is_solved(np.array(policy.reward_history[-50:]).mean()):
            break

torch.save(policy.state_dict(), 'saved_policy/policy.pt')
Example #8
    def __init__(self, env, childrenEnv=None):
        addr = AddressManager.getAddress(None)
        Agent.__init__(self, addr, env, childrenEnv)
        self._reprCount = 0
        self._rand = Random()
Example #9
agent = Agent(name="agent_2943367",
              actionScaling=1.0,
              policyNetworkSize=[256, 256],
              qNetworkSize=[256, 256],
              policyNetworkLearningRate=3e-4,
              qNetworkLearningRate=3e-4,
              entropyCoefficient="auto",
              tau=0.005,
              gamma=0.99,
              maxMemoryLength=int(5e6),
              priorityExponent=0.0,
              batchSize=256,
              maxEpisodes=0,
              trainSteps=1024,
              minStepsBeforeTraining=4096,
              rewardScaling=rewardScaling,
              actionShift=0.0,
              stepsPerUpdate=1,
              render=True,
              showGraphs=False,
              saveModel=False,
              restoreModel=True,
              train=False,
              testSteps=1024,
              maxMinutes=60,
              targetEntropy=-4.0,
              maxGradientNorm=5.0,
              meanRegularizationConstant=0.0,
              varianceRegularizationConstant=0.0,
              randomStartSteps=0,
              gradientSteps=1,
              initialExtraNoise=0,
              extraNoiseDecay=0,
              evaluationEvery=250,
              numFinalEvaluations=10)
Example #10
result = -20000
try:
    agent = Agent(name="agent_" +
                  str(np.random.randint(low=1000000, high=9999999)),
                  policyNetworkSize=[256, 256],
                  qNetworkSize=[256, 256],
                  valueNetworkSize=[256, 256],
                  entropyCoefficient=entropyCoefficient,
                  valueNetworkLearningRate=learningRate,
                  policyNetworkLearningRate=learningRate,
                  qNetworkLearningRate=learningRate,
                  tau=0.005,
                  gamma=0.99,
                  maxMemoryLength=int(1e6),
                  priorityExponent=0,
                  batchSize=256,
                  maxGradientNorm=5,
                  maxEpisodes=1024,
                  trainSteps=1024,
                  minStepsBeforeTraining=4096,
                  rewardScaling=rewardScaling,
                  actionScaling=actionScaling,
                  actionShift=0.0,
                  stepsPerUpdate=1,
                  render=True,
                  showGraphs=True,
                  meanRegularizationConstant=weightRegularizationConstant,
                  varianceRegularizationConstant=weightRegularizationConstant,
                  testSteps=1024,
                  maxMinutes=600)

    result = agent.execute()
Example #11
    def __init__(self, name, owner):

        self.agent = Agent(name, owner)

        self.name = name
        self.owner = owner
Example #12
parser.add_argument('--mode', type=str, help='Mode', default="train", choices=["train", "infer"])
args = parser.parse_args()

ray.init()

config = {}

with open(args.agent_config_path) as config_file:
    config.update(json.load(config_file))
    config["explore"] = True if args.mode == "train" else False

with open(args.environment_config_path) as config_file:
    config.update(json.load(config_file))

env = Environment(config)
agent = Agent(config)

if args.mode == "train":

    config.update({

        "num_gpus": 0,
        "num_workers": 1,

        "monitor": False
    })

    tune.run(
        agent.__class__,
        name=env.__class__.__name__+"_"+agent.__class__.__name__,
        stop={"episode_reward_mean": -100},
Example #13
import queue
import threading
import time
import datetime
from data.DataSettings import DataSettings
from data_streamer.DataStreamer import DataStreamer
from data_logger.DataLogger import DataLogger
from agent.Agent import Agent
from sensors.PowerButton import PowerButton
from sensors.DataButton import DataButton
from sensors.RelaySensor import RelaySensor

data_settings = DataSettings()
data_streamer = DataStreamer()
data_logger = DataLogger(data_streamer)
relay = RelaySensor()
agent = Agent(data_settings, relay)
power_button_pin, data_button_pin = 18, 19  # Grove Base Hat connector D18
power_button = PowerButton(power_button_pin)
data_button = DataButton(data_button_pin)
command_queue = queue.Queue()


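# setup() loads the persisted settings, updates the agent, and starts a background
# thread that reads incoming serial data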
def setup():
    data_settings.load_settings()
    agent.update()
    t1 = threading.Thread(target=read_incoming_serial)
    t1.start()


def read_incoming_serial():
    previous_serial_data = ""