Example #1
 def process_batch(engine, batch):
     optimizer.zero_grad()
     loss_v = common.calc_loss_dqn(batch,
                                   net,
                                   tgt_net.target_model,
                                   gamma=params.gamma,
                                   device=device)
     loss_v.backward()
     optimizer.step()
     epsilon_tracker.frame(engine.state.iteration)
     if engine.state.iteration % params.target_net_sync == 0:
         tgt_net.sync()
     if engine.state.iteration % EVAL_EVERY_FRAME == 0:
         eval_states = getattr(engine.state, "eval_states", None)
         if eval_states is None:
             eval_states = buffer.sample(STATES_TO_EVALUATE)
             eval_states = [
                 np.array(transition.state, copy=False)
                 for transition in eval_states
             ]
             eval_states = np.array(eval_states, copy=False)
             engine.state.eval_states = eval_states
         evaluate_states(eval_states, net, device, engine)
     return {
         "loss": loss_v.item(),
         "epsilon": selector.epsilon,
     }
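
The evaluate_states helper called above is not shown in this snippet. Below is a minimal sketch of what it could look like, assuming it reports the mean of the best predicted Q-value over the fixed evaluation states through the engine's metrics; the name and signature come from the call above, the body is an assumption:

    import torch

    @torch.no_grad()
    def evaluate_states(states, net, device, engine):
        # hypothetical body: average the greedy (max) Q-value over the fixed states
        states_v = torch.as_tensor(states, dtype=torch.float32).to(device)
        best_action_values = net(states_v).max(dim=1)[0]
        engine.state.metrics["values_mean"] = best_action_values.mean().item()
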
 def process_batch(engine, batch):
     optimizer.zero_grad()
     loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                   gamma=params.gamma, device=device)
     loss_v.backward()
     optimizer.step()
     if engine.state.iteration % params.target_net_sync == 0:
         tgt_net.sync()
     return {
         "loss": loss_v.item(),
         "epsilon": batch_generator.epsilon,
     }
 def process_batch(engine, batch):
     optimizer.zero_grad()
     loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                   gamma=params.gamma, device=device)
     loss_v.backward()
     optimizer.step()
     if engine.state.iteration % params.target_net_sync == 0:
         tgt_net.sync()
     if engine.state.iteration % NOISY_SNR_EVERY_ITERS == 0:
         for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()):
             engine.state.metrics[f'snr_{layer_idx+1}'] = sigma_l2
     return {
         "loss": loss_v.item(),
     }
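
The noisy_layers_sigma_snr() method polled in the snippet above belongs to the network class rather than to the training code. A plausible sketch, written here as a free function, assuming the model keeps its NoisyLinear layers in a noisy_layers list and that each layer exposes weight and sigma_weight parameters (both names are assumptions):

    def noisy_layers_sigma_snr(net):
        # signal-to-noise ratio per noisy layer: RMS of the learned weights
        # divided by the RMS of their learned noise scale (sigma_weight)
        return [
            ((layer.weight ** 2).mean().sqrt() /
             (layer.sigma_weight ** 2).mean().sqrt()).item()
            for layer in net.noisy_layers
        ]
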
Example #4
    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(batch,
                                      net,
                                      tgt_net.target_model,
                                      gamma=params.gamma,
                                      device=device)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine.state.iteration)
        if engine.state.iteration % params.target_net_sync == 0:
            print('syncing...')
            tgt_net.sync()

        return {'loss': loss_v.item(), "epsilon": batch_generator.epsilon}
    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(batch,
                                      net,
                                      tgt_net.target_model,
                                      gamma=params.gamma,
                                      device=device)

        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine.state.iteration * args.envs)

        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()

        return {
            'loss': loss_v.item(),
            'epsilon': selector.epsilon,
        }
Example #6
        while True:
            frame_idx += 1
            buffer.populate(1)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'])
            loss_v = common.calc_loss_dqn(batch,
                                          net,
                                          tgt_net.target_model,
                                          gamma=params['gamma'],
                                          cuda=args.cuda)
            loss_v.backward()
            optimizer.step()

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()

            if frame_idx % 500 == 0:
                for layer_idx, sigma_l2 in enumerate(
                        net.noisy_layers_sigma_snr()):
                    writer.add_scalar("sigma_snr_layer_%d" % (layer_idx + 1),
                                      sigma_l2, frame_idx)
Example #7
            '''
            do not train the NNs until the buffer's size is more than replay_initial
            '''
            if len(buffer) < params["replay_initial"]:
                continue

            optimizer.zero_grad()

            '''
            sample a batch from the experience buffer
            '''
            batch = buffer.sample(params["batch_size"])

            '''
            calculate the loss between:
            1. the Q-values of the taken actions from the main NN, using the current state
                (WARNING: this is not just the max of the outputs)
            2. the output of the Bellman equation:
                max of the TARGET NN's outputs for the next state * gamma + this step's reward
            '''
            loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                          gamma=params["gamma"], device=device)
            loss_v.backward()
            optimizer.step()

            '''
            sync the target NN with the main NN every target_net_sync steps
            '''
            if frame_idx % params["target_net_sync"] == 0:
                tgt_net.sync()
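
For reference, the Bellman-based loss described in the comments above amounts to the following. This is only a sketch of what a calc_loss_dqn-style helper computes, under the hypothetical name dqn_loss and assuming the batch has already been unpacked into arrays of states, actions, rewards, done flags and next states:

    import torch
    import torch.nn as nn

    def dqn_loss(states, actions, rewards, dones, next_states,
                 net, tgt_net, gamma, device="cpu"):
        # MSE between Q(s, a) from the main net and r + gamma * max_a' Q_target(s', a')
        states_v = torch.as_tensor(states, dtype=torch.float32).to(device)
        next_states_v = torch.as_tensor(next_states, dtype=torch.float32).to(device)
        actions_v = torch.as_tensor(actions, dtype=torch.int64).to(device)
        rewards_v = torch.as_tensor(rewards, dtype=torch.float32).to(device)
        done_mask = torch.as_tensor(dones, dtype=torch.bool).to(device)

        # Q-values of the actions actually taken, predicted by the main network
        q_taken = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

        with torch.no_grad():
            # best action value in the next state, taken from the TARGET network
            next_q = tgt_net(next_states_v).max(dim=1)[0]
            next_q[done_mask] = 0.0  # no bootstrap value after terminal states
            target_q = rewards_v + gamma * next_q

        return nn.MSELoss()(q_taken, target_q)
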
Example #8
    buffer = ptan.experience.ExperienceReplayBuffer(experience_source=None, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    exp_queue = mp.Queue(maxsize=PLAY_STEPS * 2)
    play_proc = mp.Process(target=play_func, args=(params, net, args.cuda, exp_queue, cuda_id))
    play_proc.start()

    frame_idx = 0

    while play_proc.is_alive():
        frame_idx += PLAY_STEPS
        for _ in range(PLAY_STEPS):
            exp = exp_queue.get()
            if exp is None:
                play_proc.join()
                break
            buffer._add(exp)

        if len(buffer) < params['replay_initial']:
            continue
        
        optimizer.zero_grad()
        batch = buffer.sample(params['batch_size'])
        loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                      gamma=params['gamma'], cuda=args.cuda,
                                      cuda_async=True, cuda_id=cuda_id)
        loss_v.backward()
        optimizer.step()

        if frame_idx % params['target_net_sync'] < PLAY_STEPS:
            tgt_net.sync()
Example #9
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)  # put 1 sample in the experience buffer
            epsilon_tracker.frame(frame_idx)  # set the epsilon decay for this frame

            # check for finished episodes and track their total rewards
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
            # get a batch from the experience buffer
            batch = buffer.sample(params['batch_size'])
            loss_v = common.calc_loss_dqn(batch,
                                          net,
                                          tgt_net.target_model,
                                          gamma=params['gamma'],
                                          device=device)  # compute loss
            loss_v.backward()  # compute loss derivative
            optimizer.step()  # optimization step

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()  # synchronize target network
            if len(buffer) < params['replay_initial']:
                continue

            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [
                    np.array(transition.state, copy=False)
                    for transition in eval_states
                ]
                eval_states = np.array(eval_states, copy=False)

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'])
            loss_v = common.calc_loss_dqn(batch,
                                          net,
                                          tgt_net.target_model,
                                          gamma=params['gamma'],
                                          double=False,
                                          device=device)
            loss_v.backward()
            optimizer.step()

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()

            if frame_idx % 500 == 0:
                for layer_idx, sigma_l2 in enumerate(
                        net.noisy_layers_sigma_snr()):
                    writer.add_scalar("sigma_snr_layer_%d" % (layer_idx + 1),
                                      sigma_l2, frame_idx)

            if frame_idx % EVAL_EVERY_FRAME == 0:
                mean_val = common.calc_values_of_states(eval_states, net, device=device)
                writer.add_scalar("values_mean", mean_val, frame_idx)
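
The epsilon_tracker.frame(...) call used in several of these examples decays the selector's exploration rate linearly with the frame index. A minimal sketch of such a tracker, assuming hypothetical epsilon_start / epsilon_final / epsilon_frames entries in params:

    class EpsilonTracker:
        def __init__(self, selector, params):
            self.selector = selector
            self.params = params

        def frame(self, frame_idx):
            # linear decay from epsilon_start to epsilon_final over epsilon_frames steps
            eps = self.params['epsilon_start'] - frame_idx / self.params['epsilon_frames']
            self.selector.epsilon = max(self.params['epsilon_final'], eps)
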
Example #11
            if len(buffer) < params['replay_initial']:
                continue

            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [
                    np.array(transition.state, copy=False)
                    for transition in eval_states
                ]
                eval_states = np.array(eval_states, copy=False)

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'])
            loss_v = common.calc_loss_dqn(batch,
                                          net,
                                          tgt_net.target_model,
                                          gamma=params['gamma']**args.n,
                                          device=device,
                                          double=args.double)
            loss_v.backward()
            optimizer.step()

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
                save_model(tgt_net.target_model, args.model)

            if frame_idx % EVAL_EVERY_FRAME == 0:
                mean_val = calc_values_of_states(eval_states,
                                                 net,
                                                 device=device)
                writer.add_scalar("values_mean", mean_val, frame_idx)
    with RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            # get latest rewards
            new_rewards = experience_source.pop_total_rewards()
            # new_rewards is empty until an episode ends, so we check that
            # the list is non-empty before passing its value to reward_tracker
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx):
                    break
            # until the buffer fills up enough to sample from, continue the loop
            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'])
            loss = calc_loss_dqn(batch, net, target_net.target_model,
                                 params['gamma']**args.n, device)
            loss.backward()
            optimizer.step()

            if frame_idx % params['target_net_sync'] == 0:
                target_net.sync()
            if frame_idx % 500 == 0:
                for layer_idx, sigma_l2 in \
                        enumerate(net.noisy_layers_sigma_snr()):
                    writer.add_scalar("sigma_snr_layer_%d" % (layer_idx + 1),
                                      sigma_l2, frame_idx)
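
The RewardTracker context manager used above only receives a reward once an episode has finished, which is why the code checks that new_rewards is non-empty. A plausible sketch of its behaviour, assuming it logs a 100-episode running mean to TensorBoard and returns True once that mean passes stop_reward:

    class RewardTracker:
        def __init__(self, writer, stop_reward):
            self.writer = writer
            self.stop_reward = stop_reward
            self.total_rewards = []

        def __enter__(self):
            return self

        def __exit__(self, *args):
            self.writer.close()

        def reward(self, reward, frame_idx, epsilon=None):
            # record the finished episode and report the mean over the last 100 episodes
            self.total_rewards.append(reward)
            last = self.total_rewards[-100:]
            mean_reward = sum(last) / len(last)
            self.writer.add_scalar("reward_100", mean_reward, frame_idx)
            if epsilon is not None:
                self.writer.add_scalar("epsilon", epsilon, frame_idx)
            # True tells the training loop that the reward boundary has been reached
            return mean_reward > self.stop_reward
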
Example #13
 buffer.populate(1)  # where all the magic happens!
 # 1- the buffer asks the experience source to produce a transition s, a, R, s'
 # 2- the experience source feeds the current observation s to the agent
 # 3- the agent feeds the observation to the network, gets the Q-values for it and asks the action selector to decide on an action
 # 4- the action selector generates a random value, compares it to epsilon and decides whether to act greedily or to pick a random action
 # 5- the chosen action is passed to the experience source, which feeds it to the environment to get the reward R and the new state s'; now s, a, R, s' is passed to the buffer
 # 6- the buffer takes the s, a, R, s' tuple in and kicks out an old one to maintain the same size
 epsilon_tracker.frame(frame_idx)
 new_rewards = exp_source.pop_total_rewards()
 if new_rewards:
     if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
         #If the reward tracker returns True, then it's an indication that the mean reward has reached the score boundary and we can stop our training.
         break
 
 if len(buffer) < params['replay_initial']:  # we need to fill the buffer before training so that we have episodes to train on
     continue
 if eval_states is None:
     eval_states = buffer.sample(STATES_TO_EVALUATE)
     eval_states = [np.array(transition.state, copy=False) for transition in eval_states]
     eval_states = np.array(eval_states, copy=False)
 if frame_idx % EVAL_EVERY_FRAME == 0:
     mean_val = common.calc_values_of_states(eval_states, net, device=device)
     writer.add_scalar("values_mean", mean_val, frame_idx)
 # taking a training step!
 optimizer.zero_grad()
 batch = buffer.sample(params['batch_size'])
 loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                               gamma=params['gamma']**unrolling_steps,
                               double=double, device=device)
 loss_v.backward()
 optimizer.step()
 if frame_idx % params['target_net_sync'] == 0:  # when to sync our target network
     tgt_net.sync()
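
The step-by-step comments at the top of this example describe the chain of ptan components behind buffer.populate(1). A minimal sketch of how that chain is typically wired, reusing the ptan calls that already appear in these examples; net, env, device and params are assumed to be defined as in the surrounding code, and the selector and agent constructors are standard ptan classes rather than part of this snippet:

    import ptan

    # epsilon-greedy selector: acts greedily or at random depending on its epsilon
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)
    # agent: feeds observations to the network and asks the selector for an action
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    # experience source: steps the environment and yields (s, a, R, s') transitions
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    # fixed-size replay buffer; populate(1) pulls one transition and evicts the oldest
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params['replay_size'])
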
Example #14
        frame_idx += PLAY_STEPS
        for _ in range(PLAY_STEPS):
            exp = exp_queue.get()
            if exp is None:
                play_proc.join()
                break
            buffer._add(exp)

        if len(buffer) < params['replay_initial']:
            continue

        # train on a batch sampled from the experience replay buffer
        optimizer.zero_grad()
        optimizer_tm.zero_grad()
        batch = buffer.sample(params['batch_size'])
        loss_v, tm_loss = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                               gamma=params['gamma'], cuda=args.cuda,
                                               cuda_async=True, fsa=args.fsa,
                                               tm_net=tm_net)
        loss_v.backward()
        optimizer.step()

        tm_loss.backward()
        optimizer_tm.step()

        if frame_idx > counter*params['video_interval'] and args.video:
            test_env = wrappers.Monitor(make_env(params),
                                        "{}/frame{}".format(video_path, counter),
                                        video_callable=lambda ep_id: True if ep_id < 3 else False,
                                        force=True)
            obs = test_env.reset()
            test_agent = ptan.agent.PolicyAgent(net, action_selector=ptan.actions.ArgmaxActionSelector(),
                                                device=device, fsa=args.fsa)
            real_done = False
Example #15
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'])
            loss_v = common.calc_loss_dqn(HVALUE, H_map, batch, net, tgt_net.target_model,
                                          gamma=params['gamma'], device=device)
            time.sleep(2)
            loss_v.backward()
            optimizer.step()

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=args.n)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'])
            loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                          gamma=params['gamma']**args.n, device=device)
            loss_v.backward()
            optimizer.step()

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()