Example #1
    def __init__(self,
                 action_size: int,
                 state_dim: int,
                 action_dim: int,
                 gamma: float,
                 sess: tf.Session,
                 optimizer: tf.train.Optimizer = tf.train.AdamOptimizer(
                     learning_rate=0.001),
                 max_tf_checkpoints_to_keep: int = 3,
                 experience_size: int = 1000,
                 per: bool = False,
                 batch_size: int = 64,
                 start_steps: int = 2000):
        self.optimizer = optimizer
        self.sess = sess
        self.gamma = gamma
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.action_size = action_size
        self.per = per

        self.actor = ActorNetwork(action_size=action_size,
                                  state_dim=state_dim,
                                  action_dim=action_dim,
                                  sess=sess,
                                  optimizer=optimizer)

        self.critic = CriticNetwork(action_size=action_size,
                                    state_dim=state_dim,
                                    action_dim=action_dim,
                                    sess=sess,
                                    optimizer=optimizer,
                                    gamma=gamma)

        self.eval_mode = False
        self.t = 0
        self.start_steps = start_steps
        self.training_steps = 0
        self.epsilon = 1
        self.batch_size = batch_size

        self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep)

        if self.per:
            self._replay = PER(experience_size)
        else:
            self._replay = ReplayBuffer(experience_size)

        self._last_state = None
        self._last_items = None
        self._last_action = None

        self.td_losses = []
        self.qvalues = []
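
ReplayBuffer and PER are imported from elsewhere in this project and not shown. A minimal sketch of a uniform replay buffer that matches the ReplayBuffer(experience_size) call above; the add/sample method names are assumptions for illustration, not the project's actual API:

import random
from collections import deque


class ReplayBuffer:
    """Uniform experience replay with a fixed capacity (sketch)."""

    def __init__(self, capacity):
        # Oldest transitions are evicted automatically once capacity is reached.
        self._storage = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self._storage.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a batch and regroup it by field.
        batch = random.sample(self._storage, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self._storage)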
Example #2
    def __init__(self,
                 sess,
                 state_dim,
                 action_dim,
                 epsilon=0.4,
                 action_size=4,
                 logdir='./logs/',
                 replay_size=1000,
                 batch_size=64):

        self._state_dim = state_dim
        self._action_dim = action_dim
        self._action_size = action_size

        self._logdir = logdir

        self._sess = sess

        self.epsilon = epsilon
        self.gamma = 0.9
        self.lr = 1e-4
        self.optimizer = tf.train.AdadeltaOptimizer(self.lr)

        self.state, self.action, self.agent, self.weights = self._create_network(
            'agent')

        self.qvalues = self.agent(tf.concat([self.state, self.action],
                                            axis=-1))

        self.target_state, self.target_action, self.target, self.target_weights = self._create_network(
            'target')
        self.target_qvalues = self.target(
            tf.concat([self.target_state, self.target_action], axis=-1))

        self.train_op, self.td_loss = self._create_train_op()
        self.target_update_op = self._create_target_update_op()

        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(self._logdir,
                                                  self._sess.graph)
        self.summary = None

        self._replay = ReplayBuffer(replay_size)
        self.batch_size = batch_size
        self.td_losses = []

        self._last_state = None
        self._last_items = None
        self._last_action = None
        self.eval_mode = False
        self.training_steps = 0
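
_create_target_update_op is not shown in this snippet. A common TF1 way to build such an op is to Polyak-average the online weights into the target weights; the sketch below assumes that approach (the real method may instead do a hard copy, and the tau parameter is hypothetical):

import tensorflow as tf


def create_target_update_op(weights, target_weights, tau=1e-3):
    # Soft update: target <- tau * online + (1 - tau) * target
    ops = [
        tf.assign(w_t, tau * w + (1.0 - tau) * w_t)
        for w, w_t in zip(weights, target_weights)
    ]
    return tf.group(*ops)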
Example #3
File: agent.py  Project: shintay/rl
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high)

        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0  #0
        self.exploration_theta = 0.125 # 0.14 | 0.1
        self.exploration_sigma = 0.0009 # 0.001 | 0.2 | 0.001
        self.noise = OUNoise(self.action_size,
                             self.exploration_mu,
                             self.exploration_theta,
                             self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size,
                                   self.batch_size)

        # Algorithm parameters
        self.gamma = 0.998  # 0.99 | 0.9 | discount factor
        self.tau = 0.099  # 0.001| 0.01 | 0.1 | 0.05 |  for soft update of target parameters

        # Score tracker
        self.best_score = -np.inf
        self.score = 0
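
OUNoise is defined in a separate module of this project. A minimal Ornstein-Uhlenbeck process sketch that matches the positional (size, mu, theta, sigma) constructor used above; the sample/reset method names are assumptions:

import numpy as np


class OUNoise:
    """Mean-reverting Ornstein-Uhlenbeck exploration noise (sketch)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process from its long-run mean.
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state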
Example #4
    def __init__(
        self,
        state_size,
        network_id: str,
        buffer_size: int = 20000,
        batch_size: int = 64,
        gamma: float = 0.99,
        lr: float = 1e-4,
        train_freq: int = 4,
        target_update_freq: int = 1000,
        min_epsilon: float = 0.05,
        epsilon_decay: float = 0.0005,
        scheduler_id: str = 'linear',
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.train_freq = train_freq
        self.epsilon_scheduler = scheduler_hub[scheduler_id](
            1, epsilon_decay, min_epsilon
        )
        self.t_step = 0
        self.memory = ReplayBuffer(buffer_size, batch_size, self.device)
        self.episode_logs = {}

        # self.future_network = FutureNetwork()
        # self.reward_network = RewardNetwork()
        # self.opt_future_reward = optim.RMSprop(self.future_network.parameters(), lr=self.lr)
        # self.keypoint_network = KeypointNetwork()
        # self.opt_keypoint = optim.RMSprop(self.future_network.parameters(), lr=self.lr)
        self.state_encoder = StateEncoder(state_size).to(self.device)
        self.state_decoder = StateDecoder(state_size).to(self.device)
        self.opt_ae = optim.RMSprop(
            list(self.state_encoder.parameters()) + list(self.state_decoder.parameters()),
            lr=self.lr,
        )
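
scheduler_hub and its scheduler classes are not shown. Given the call scheduler_hub[scheduler_id](1, epsilon_decay, min_epsilon), a linear decay scheduler might look like the sketch below; the class, its step method, and the registry are assumptions for illustration only:

class LinearScheduler:
    """Linearly decay a value from `start` toward `minimum` (sketch)."""

    def __init__(self, start, decay, minimum):
        self.value = start
        self.decay = decay
        self.minimum = minimum

    def step(self):
        # Subtract a fixed amount per call, clipped at the floor value.
        self.value = max(self.minimum, self.value - self.decay)
        return self.value


# Hypothetical registry mirroring the scheduler_hub lookup used above.
scheduler_hub = {'linear': LinearScheduler}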
Example #5
    gif_req_m = mp.Value('i', -1)
    data_proc_list = []
    for _ in range(hp.N_ROLLOUT_PROCESSES):
        data_proc = mp.Process(target=data_func,
                               args=(pi, device, exp_queue, finish_event,
                                     gif_req_m, hp))
        data_proc.start()
        data_proc_list.append(data_proc)

    # Training
    tgt_Q = TargetCritic(Q)
    pi_opt = optim.Adam(pi.parameters(), lr=hp.LEARNING_RATE)
    Q_opt = optim.Adam(Q.parameters(), lr=hp.LEARNING_RATE)
    alpha_optim = optim.Adam([log_alpha], lr=hp.LEARNING_RATE)
    buffer = ReplayBuffer(buffer_size=hp.REPLAY_SIZE,
                          observation_space=hp.observation_space,
                          action_space=hp.action_space,
                          device=hp.DEVICE)
    n_grads = 0
    n_samples = 0
    n_episodes = 0
    best_reward = None
    last_gif = None

    try:
        while n_grads < hp.TOTAL_GRAD_STEPS:
            metrics = {}
            ep_infos = list()
            st_time = time.perf_counter()
            # Collect EXP_GRAD_RATIO samples for each grad step
            new_samples = 0
            while new_samples < hp.EXP_GRAD_RATIO:
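
TargetCritic is imported from elsewhere; it is presumably a wrapper that keeps a frozen copy of the critic and periodically syncs it toward the online weights. A sketch under that assumption (the sync signature is a guess, not the project's actual API):

import copy

import torch


class TargetCritic:
    """Frozen copy of a critic used as the target network (sketch)."""

    def __init__(self, model):
        self.model = model
        self.target_model = copy.deepcopy(model)

    def sync(self, alpha=0.995):
        # Polyak averaging: target <- alpha * target + (1 - alpha) * online
        with torch.no_grad():
            for tgt, src in zip(self.target_model.parameters(),
                                self.model.parameters()):
                tgt.mul_(alpha).add_(src, alpha=1.0 - alpha)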
Example #6
def main(args):
    device = "cuda" if args.cuda else "cpu"
    mp.set_start_method('spawn')
    # Input Experiment Hyperparameters
    hp = SACHP(EXP_NAME=args.name,
               DEVICE=device,
               ENV_NAME=args.env,
               N_ROLLOUT_PROCESSES=3,
               LEARNING_RATE=0.0001,
               EXP_GRAD_RATIO=10,
               BATCH_SIZE=256,
               GAMMA=0.95,
               REWARD_STEPS=3,
               ALPHA=0.015,
               LOG_SIG_MAX=2,
               LOG_SIG_MIN=-20,
               EPSILON=1e-6,
               REPLAY_SIZE=100000,
               REPLAY_INITIAL=512,
               SAVE_FREQUENCY=100000,
               GIF_FREQUENCY=100000,
               TOTAL_GRAD_STEPS=1000000)
    wandb.init(project='RoboCIn-RL',
               name=hp.EXP_NAME,
               entity='robocin',
               config=hp.to_dict())
    current_time = datetime.datetime.now().strftime('%b-%d_%H-%M-%S')
    tb_path = os.path.join(
        'runs', current_time + '_' + hp.ENV_NAME + '_' + hp.EXP_NAME)
    # Training
    sac = SAC(hp)
    buffer = ReplayBuffer(buffer_size=hp.REPLAY_SIZE,
                          observation_space=hp.observation_space,
                          action_space=hp.action_space,
                          device=hp.DEVICE)

    # Playing
    sac.share_memory()
    exp_queue = mp.Queue(maxsize=hp.EXP_GRAD_RATIO)
    finish_event = mp.Event()
    gif_req_m = mp.Value('i', -1)
    data_proc = mp.Process(target=rollout,
                           args=(sac, device, exp_queue, finish_event,
                                 gif_req_m, hp))
    data_proc.start()

    n_grads = 0
    n_samples = 0
    n_episodes = 0
    best_reward = None
    last_gif = None
    try:
        while n_grads < hp.TOTAL_GRAD_STEPS:
            metrics = {}
            ep_infos = list()
            st_time = time.perf_counter()
            # Collect EXP_GRAD_RATIO samples for each grad step
            new_samples = 0
            while new_samples < hp.EXP_GRAD_RATIO:
                exp = exp_queue.get()
                if exp is None:
                    raise Exception("got None value in queue")
                safe_exp = copy.deepcopy(exp)
                del exp

                # Dict is returned with end of episode info
                if isinstance(safe_exp, dict):
                    logs = {
                        "ep_info/" + key: value
                        for key, value in safe_exp.items()
                        if 'truncated' not in key
                    }
                    ep_infos.append(logs)
                    n_episodes += 1
                else:
                    if safe_exp.last_state is not None:
                        last_state = safe_exp.last_state
                    else:
                        last_state = safe_exp.state
                    buffer.add(obs=safe_exp.state,
                               next_obs=last_state,
                               action=safe_exp.action,
                               reward=safe_exp.reward,
                               done=safe_exp.last_state is None)
                    new_samples += 1
            n_samples += new_samples
            sample_time = time.perf_counter()

            # Only start training after buffer is larger than initial value
            if buffer.size() < hp.REPLAY_INITIAL:
                continue

            # Sample a batch and load it as a tensor on device
            batch = buffer.sample(hp.BATCH_SIZE)
            metrics["train/loss_pi"], metrics["train/loss_Q1"], \
                metrics["train/loss_Q2"], metrics["train/loss_alpha"], \
                metrics["train/alpha"] = sac.update(batch=batch,
                                                    metrics=metrics)

            n_grads += 1
            grad_time = time.perf_counter()
            metrics['speed/samples'] = new_samples / (sample_time - st_time)
            metrics['speed/grad'] = 1 / (grad_time - sample_time)
            metrics['speed/total'] = 1 / (grad_time - st_time)
            metrics['counters/samples'] = n_samples
            metrics['counters/grads'] = n_grads
            metrics['counters/episodes'] = n_episodes
            metrics["counters/buffer_len"] = buffer.size()

            if ep_infos:
                for key in ep_infos[0].keys():
                    metrics[key] = np.mean([info[key] for info in ep_infos])

            # Log metrics
            wandb.log(metrics)
            if hp.SAVE_FREQUENCY and n_grads % hp.SAVE_FREQUENCY == 0:
                save_checkpoint(hp=hp,
                                metrics={
                                    'alpha': sac.alpha,
                                    'n_samples': n_samples,
                                    'n_grads': n_grads,
                                    'n_episodes': n_episodes
                                },
                                pi=sac.pi,
                                Q=sac.Q,
                                pi_opt=sac.pi_opt,
                                Q_opt=sac.Q_opt)

            if hp.GIF_FREQUENCY and n_grads % hp.GIF_FREQUENCY == 0:
                gif_req_m.value = n_grads

    except KeyboardInterrupt:
        print("...Finishing...")
        finish_event.set()

    finally:
        if exp_queue:
            while exp_queue.qsize() > 0:
                exp_queue.get()

        print('queue is empty')

        print("Waiting for threads to finish...")
        data_proc.terminate()
        data_proc.join()

        del exp_queue
        del sac

        finish_event.set()
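
save_checkpoint is imported from elsewhere. A sketch of what such a helper could do with torch.save, given the keyword arguments used above; the checkpoint keys and output path here are made up for illustration:

import os

import torch


def save_checkpoint(hp, metrics, pi, Q, pi_opt, Q_opt):
    # Bundle networks, optimizers and training counters into a single file.
    checkpoint = {
        "pi_state_dict": pi.state_dict(),
        "Q_state_dict": Q.state_dict(),
        "pi_opt_state_dict": pi_opt.state_dict(),
        "Q_opt_state_dict": Q_opt.state_dict(),
        "metrics": metrics,
        "hp": hp.to_dict(),
    }
    path = os.path.join("checkpoints", hp.ENV_NAME, hp.EXP_NAME)
    os.makedirs(path, exist_ok=True)
    torch.save(checkpoint,
               os.path.join(path,
                            "checkpoint_{:09d}.pth".format(metrics["n_grads"])))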
Example #7
def training_loop(agent_config, env_config, vaccination_schedule):
    action_dim = len(env_config["groups"]) * 2 * len(vaccination_schedule)
    state_dim = len(env_config["groups"]) + len(vaccination_schedule) * len(
        env_config["groups"]) + len(vaccination_schedule)
    batch_size = 100
    episodes = 150
    max_iterations = env_config["max_time_steps"]

    env = VaccinationEnvironment(env_config, vaccination_schedule)
    agent = TD3Agent([state_dim + action_dim, 256, 256, 1],
                     [state_dim, 256, 256, action_dim])
    replay_buffer = ReplayBuffer(state_dim, action_dim)

    # get starting state
    state = []
    for items in list(env.get_cases().values()):
        for value in items:
            state.append(value)
    state.append(env.get_vaccines()[0].num)

    # first sample enough data
    for i in range(2 * batch_size):
        available_vaccines = env.get_vaccines()[0].num

        action = agent.act(state)

        vaccination_plan = []
        for index in range(0, len(action), 2):
            group_vac_1 = int(action[index] * available_vaccines)
            group_vac_2 = int(action[index + 1] * available_vaccines)
            plan = Vaccination_Plan(JOHNSON, group_vac_1, group_vac_2)
            vaccination_plan.append([plan])

        info, done = env.step(vaccination_plan, False)
        reward = -sum([values[1] for values in info.values()])

        next_state = []
        for items in list(env.get_cases().values()):
            for value in items:
                next_state.append(value)
        next_state.append(available_vaccines)

        replay_buffer.add(state, action, next_state, reward, done)

        state = next_state

    # training
    rewards = []
    losses = []
    # get starting state
    for i in tqdm(range(episodes)):
        episode_reward = []
        episode_loss = []

        env.reset()
        state = []
        for items in list(env.get_cases().values()):
            for value in items:
                state.append(value)
        state.append(env.get_vaccines()[0].num)

        for j in range(max_iterations):
            available_vaccines = env.get_vaccines()[0].num
            action = agent.act(state)

            vaccination_plan = []
            for index in range(0, len(action), 2):
                group_vac_1 = int(action[index] * available_vaccines)
                group_vac_2 = int(action[index + 1] * available_vaccines)
                plan = Vaccination_Plan(JOHNSON, group_vac_1, group_vac_2)
                vaccination_plan.append([plan])

            info, done = env.step(vaccination_plan, True)
            reward = sum([values[1] for values in info.values()])
            episode_reward.append(reward)

            next_state = []
            for items in list(env.get_cases().values()):
                for value in items:
                    next_state.append(value)
            next_state.append(available_vaccines)

            replay_buffer.add(state, action, next_state, reward, done)
            state = next_state

            # train
            train_states, train_actions, train_next_states, train_reward, train_done = replay_buffer.sample(
                batch_size)
            loss = agent.train(train_states, train_actions, train_next_states,
                               train_reward, train_done)

            episode_loss.append(loss.detach().numpy())

            if done:
                rewards.append(sum(episode_reward))
                losses.append(sum(episode_loss) / len(episode_loss))
                break

    # finally save data:
    data = pd.DataFrame(data={"rewards": rewards, "losses": losses})
    data.to_csv(
        "P:/Dokumente/3 Uni/WiSe2021/Hackathon/Hackathon_KU/exp/performance.csv",
        sep=",",
        index=False)
Example #8
            target=data_func,
            args=(pi, device, exp_queue, finish_event, sigma_m, gif_req_m, hp),
        )
        data_proc.start()
        data_proc_list.append(data_proc)
    pi_opt = optim.Adam(pi.parameters(), lr=hp.LEARNING_RATE)
    Q_opt = optim.Adam(Q.parameters(), lr=hp.LEARNING_RATE)
    # Training
    tgt_pi = TargetActor(pi)
    tgt_Q = TargetCritic(Q)
    tgt_Q_strat = TargetCritic(Q_strat)
    Q_strat_opt = optim.Adam(Q_strat.parameters(), lr=hp.LEARNING_RATE)
    buffer = ReplayBuffer(
        buffer_size=hp.REPLAY_SIZE,
        observation_space=hp.observation_space,
        action_space=hp.action_space,
        device=hp.DEVICE,
        n_rew=hp.N_REWARDS,
    )
    n_grads = 0
    n_samples = 0
    n_episodes = 0
    best_reward = None
    last_gif = None

    try:
        alphas = torch.Tensor([0.6600, 0.3200, 0.0053, 0.0080]).to(device)
        while n_grads < hp.TOTAL_GRAD_STEPS:
            metrics = {}
            ep_infos = list()
            st_time = time.perf_counter()
Example #9
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 device,
                 seed=23520,
                 GRADIENT_CLIP=1,
                 ACTIVATION=F.relu,
                 BOOTSTRAP_SIZE=5,
                 GAMMA=0.99,
                 TAU=1e-3,
                 LR_CRITIC=5e-4,
                 LR_ACTOR=5e-4,
                 UPDATE_EVERY=1,
                 TRANSFER_EVERY=2,
                 UPDATE_LOOP=10,
                 ADD_NOISE_EVERY=5,
                 WEIGHT_DECAY=0,
                 MEMORY_SIZE=5e4,
                 BATCH_SIZE=64):
        """Initialize an Agent object.
        
        Params
        ======
            state_size  : dimension of each state
            action_size : dimension of each action
            num_agents  : number of running agents
            device: cpu or cuda:0 if available
            -----These are hyperparameters----
            BOOTSTRAP_SIZE      : How far ahead to bootstrap
            GAMMA               : Discount factor
            TAU                 : Parameter for performing soft updates of target parameters
            LR_CRITIC, LR_ACTOR : Learning rate of the networks
            UPDATE_EVERY        : How often to update the networks
            TRANSFER_EVERY      : How often to transfer the weights from local to target
            UPDATE_LOOP         : Number of iterations for network update
            ADD_NOISE_EVERY     : How often to add noise to favor exploration
            WEIGHT_DECAY        : L2 weight decay for critic optimizer
            GRADIENT_CLIP       : Limit of gradient to be clipped, to avoid exploding gradient issue
        """

        # Actor networks
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optim = optim.Adam(self.actor_local.parameters(),
                                      lr=LR_ACTOR,
                                      weight_decay=WEIGHT_DECAY)
        hard_update(self.actor_local, self.actor_target)

        # Critic networks
        self.critic_local = Critic(state_size * 2, action_size).to(device)
        self.critic_target = Critic(state_size * 2, action_size).to(device)
        self.critic_optim = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)
        hard_update(self.critic_local, self.critic_target)

        self.device = device
        self.num_agents = num_agents

        # Noise : using simple noise instead of OUNoise
        self.noise = [
            SimpleNoise(action_size, scale=1) for _ in range(num_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(action_size, device, int(MEMORY_SIZE),
                                   BATCH_SIZE, seed)

        # Initialize time steps (for updating every UPDATE_EVERY steps)
        self.u_step = 0
        self.n_step = 0

        # Keep the hyperparameters within the instance
        self.BOOTSTRAP_SIZE = BOOTSTRAP_SIZE
        self.GAMMA = GAMMA
        self.TAU = TAU
        self.LR_CRITIC = LR_CRITIC
        self.LR_ACTOR = LR_ACTOR
        self.UPDATE_EVERY = UPDATE_EVERY
        self.TRANSFER_EVERY = TRANSFER_EVERY
        self.UPDATE_LOOP = UPDATE_LOOP
        self.ADD_NOISE_EVERY = ADD_NOISE_EVERY
        self.GRADIENT_CLIP = GRADIENT_CLIP

        # Deques holding the last BOOTSTRAP_SIZE transitions, needed to compute the n-step (bootstrapped) returns
        self.rewards = deque(maxlen=BOOTSTRAP_SIZE)
        self.states = deque(maxlen=BOOTSTRAP_SIZE)
        self.actions = deque(maxlen=BOOTSTRAP_SIZE)
        self.gammas = np.array([[GAMMA**i for j in range(num_agents)]
                                for i in range(BOOTSTRAP_SIZE)])

        self.loss_function = torch.nn.SmoothL1Loss()
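
The reward deque together with the gammas matrix (shape BOOTSTRAP_SIZE x num_agents) points to n-step returns; a sketch of how the two could be combined per agent (the helper name and signature are assumptions, not part of the original class):

import numpy as np


def n_step_return(rewards_deque, gammas):
    # rewards_deque: BOOTSTRAP_SIZE arrays of shape (num_agents,)
    # gammas: array of shape (BOOTSTRAP_SIZE, num_agents) holding GAMMA**i
    rewards = np.array(rewards_deque)      # (BOOTSTRAP_SIZE, num_agents)
    return (gammas * rewards).sum(axis=0)  # discounted sum per agent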