def __init__(self, opt, q_trace, learner):
    """Keep references to the collaborators and build the behaviour network.

    Args:
        opt: Options object; handed unchanged to ``ActorCritic``.
        q_trace: Object stored on ``self.q_trace`` for later use.
        learner: Object stored on ``self.learner`` for later use.
    """
    self.opt, self.q_trace, self.learner = opt, q_trace, learner

    # Game environment: not created here, only the slot is reserved.
    self.env = None

    # Network: behaviour policy placed on the module-level ``device``.
    behaviour_net = ActorCritic(opt)
    self.behaviour = behaviour_net.to(device)
def __init__(self, args):
    """Initialize the attributes of the PPO class.

    Args:
        args: Configuration object. This constructor reads ``nb_states``,
            ``nb_actions``, ``hidden_layer_size``, ``optimizer``, ``lr``,
            ``num_envs``, ``env``, ``seed``, ``output_scaling`` and
            ``play`` from it.
    """
    self.args = args
    self.random_seed()

    # Check if GPU is available via CUDA driver
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")

    # Initialize the actor critic class
    self.actor_critic = ActorCritic(
        self.args.nb_states, self.args.nb_actions,
        self.args.hidden_layer_size).to(self.device)

    # Define the optimizer used for the optimization of the surrogate loss
    self.optimizer = self.args.optimizer(self.actor_critic.parameters(),
                                         self.args.lr)

    # For training multiple instances of the env are needed (Shoulder model)
    self.envs = [self.make_env() for _ in range(self.args.num_envs)]
    self.envs = SubprocVecEnv(self.envs)

    # To validate the intermediate learning process one test env is needed
    self.env_test = self.args.env
    self.env_test.seed(self.args.seed)
    self.env_test.set_scaling(self.args.output_scaling)

    # Lists for Tensorboard to visualize learning process during learning
    self.test_rewards = []
    self.loss = []
    self.lr = []
    self.actor_grad_weight = []
    self.action_bang_bang = []
    self.lr.append(self.args.lr)

    # Dump bin files
    if self.args.play is False:
        self.output_path = "trained_models" + '/PPO_{}'.format(
            datetime.now().strftime('%Y%b%d_%H%M%S')) + "/"
        # makedirs also creates the parent "trained_models" directory,
        # which a plain mkdir would require to exist already.
        os.makedirs(self.output_path, exist_ok=True)
        self.writer = SummaryWriter(self.output_path)
#!/usr/bin/python3
from pendulum import Pendulum
from network import ActorCritic
import numpy as np
import pickle
import os.path
import random

actorCritic = ActorCritic(Pendulum.state_size, Pendulum.action_size)

# Resume from previously recorded experiences when the file exists.
experiences = []
if os.path.exists('experiences.p'):
    # NOTE: unpickling executes arbitrary code; only load files this
    # script wrote itself. The context manager closes the handle promptly.
    with open("experiences.p", "rb") as experiences_file:
        experiences = pickle.load(experiences_file)
print('experiences', len(experiences))

pendulum = Pendulum(Pendulum.random_theta())
# NOTE(review): ``round`` shadows the builtin; the name is kept because it
# may be used by code following this section.
round = 0
score = 1
iteration = 0
cumulative_iterations = 0
action0 = False
while round < 27:
    state0 = pendulum.state()
    actions = actorCritic.run_actor([state0])
    # With probability 0.25 pick a uniformly random action.
    if random.random() < 0.25:
        action1 = np.random.choice(Pendulum.action_size, 1)[0]
parser.add_argument('--gamma', default=0.99, type=float, help='rewards discount factor') parser.add_argument('--entropy_weight', default=0.0001, type=float) parser.add_argument('--alpha', default=0.95, type=float) parser.add_argument('--type', default='notrpo', type=str, help='iftrpo') parser.add_argument('--render', action='store_true', help='render') args = parser.parse_args() # print(args) torch.manual_seed(args.seed) env = gym.make("CartPole-v0") replay_buffer = ReplayBuffer(args.capacity, args.max_episode_length) model = ActorCritic(env.observation_space.shape[0], env.action_space.n).cuda() average_model = ActorCritic(env.observation_space.shape[0], env.action_space.n).cuda() optimizer = optim.Adam(model.parameters()) frame_idx = 0 test_rewards = [] episode_count = 0 step_count = 0 state = env.reset() running_rew = 0 plotcount = 0 while frame_idx < args.max_frames:
'action0': action0, 'state1': state1, 'action1': action1, 'score1': score1 } experiences.append(experience) action0 = action1 # print(action1, actions, state1[Pendulum.state_size - 1]) cumulative_score_run += score1 iterations += 1 print('score final ', score, ' average ', cumulative_score_run / iterations, ' initial theta ', pendulum.initial_theta, ' iterations ', iterations) cumulative_score += score1 cumulative_iterations += iterations pendulum = Pendulum(Pendulum.random_theta()) return cumulative_score / count, cumulative_iterations / count actorCritic = ActorCritic(Pendulum.state_size, Pendulum.action_size) score, iterations = run_test(27, actorCritic) print('score', score, 'iterations', iterations) pickle.dump(experiences, open("experiences.p", "wb"))
class PPO(object):
    """Main PPO class.

    Collects rollouts from several parallel environments, estimates returns
    with generalized advantage estimation and optimizes the clipped
    surrogate objective on an ``ActorCritic`` network.
    """

    def __init__(self, args):
        """Initialize the attributes of the PPO class.

        Args:
            args: Configuration object. The class reads, among others,
                ``nb_states``, ``nb_actions``, ``hidden_layer_size``,
                ``optimizer``, ``lr``, ``num_envs``, ``env``, ``seed``,
                ``output_scaling`` and ``play`` from it.
        """
        self.args = args
        self.random_seed()

        # Check if GPU is available via CUDA driver
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # Initialize the actor critic class
        self.actor_critic = ActorCritic(
            self.args.nb_states, self.args.nb_actions,
            self.args.hidden_layer_size).to(self.device)

        # Define the optimizer used for the optimization of the surrogate loss
        self.optimizer = self.args.optimizer(self.actor_critic.parameters(),
                                             self.args.lr)

        # For training multiple instances of the env are needed (Shoulder model)
        self.envs = [self.make_env() for _ in range(self.args.num_envs)]
        self.envs = SubprocVecEnv(self.envs)

        # To validate the intermediate learning process one test env is needed
        self.env_test = self.args.env
        self.env_test.seed(self.args.seed)
        self.env_test.set_scaling(self.args.output_scaling)

        # Lists for Tensorboard to visualize learning process during learning
        self.test_rewards = []
        self.loss = []
        self.lr = []
        self.actor_grad_weight = []
        self.action_bang_bang = []
        self.lr.append(self.args.lr)

        # Only created when not in play mode; ``None`` lets other methods
        # detect that no output directory / summary writer exists.
        self.output_path = None
        self.writer = None

        # Dump bin files
        if self.args.play is False:
            self.output_path = "trained_models" + '/PPO_{}'.format(
                datetime.now().strftime('%Y%b%d_%H%M%S')) + "/"
            # makedirs also creates the parent "trained_models" directory.
            os.makedirs(self.output_path, exist_ok=True)
            self.writer = SummaryWriter(self.output_path)

    def train(self):
        """Main training function.

        Alternates between collecting ``args.nb_steps`` transitions from the
        vectorized environments and running ``ppo_update`` on them, until
        ``args.max_frames`` steps were taken. Every 1000 steps the policy is
        evaluated on the test environment.
        """
        frame_idx = 0
        state = self.envs.reset()
        mean_100_reward = -np.inf
        self.info()

        while frame_idx < self.args.max_frames:
            log_probs, values, states = [], [], []
            actions, rewards, masks = [], [], []

            for _ in range(self.args.nb_steps):
                state = torch.FloatTensor(state).to(self.device)
                dist, value = self.actor_critic(state)
                action = dist.sample()

                # Make sure action is loaded to CPU (not GPU)
                next_state, reward, done, _ = self.envs.step(
                    action.cpu().numpy())

                log_probs.append(dist.log_prob(action))
                values.append(value)
                rewards.append(
                    torch.FloatTensor(reward).unsqueeze(1).to(self.device))
                # mask is 0 where an episode ended, 1 otherwise
                masks.append(
                    torch.FloatTensor(1 - done).unsqueeze(1).to(self.device))
                states.append(state)
                actions.append(action)

                state = next_state
                frame_idx += 1

                # Evaluate training process and write data to tensorboard
                if frame_idx % 1000 == 0:
                    mean_100_reward = self._evaluate(frame_idx,
                                                     mean_100_reward)

            next_state = torch.FloatTensor(next_state).to(self.device)
            _, next_value = self.actor_critic(next_state)
            returns = self.calc_gae(next_value, rewards, masks, values,
                                    self.args.gamma, self.args.tau)

            # detach() takes the tensors out of the graph, i.e. these
            # operations are ignored for gradient calculations
            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
            advantage = returns - values

            self.ppo_update(self.args.ppo_epochs, self.args.mini_batch_size,
                            states, actions, log_probs, returns, advantage,
                            self.args.clip)

    def _evaluate(self, frame_idx, best_mean_reward):
        """Run 10 test episodes, log the results and checkpoint on improvement.

        Args:
            frame_idx: Number of training steps taken so far.
            best_mean_reward: Best rounded mean test reward seen so far.

        Returns:
            The (possibly updated) best rounded mean test reward.
        """
        test_reward = np.mean(
            [self.test_env(self.args.vis) for _ in range(10)])
        self.test_rewards.append(test_reward)

        if self.args.play is not False:
            return best_mean_reward

        # Mean over the most recent (up to) 100 evaluations, including the
        # one that was just appended.
        mean_reward = np.round(self._recent_mean(self.test_rewards, 100), 0)
        print("Mean reward: ", mean_reward)
        if best_mean_reward < mean_reward:
            best_mean_reward = mean_reward
            self.save_network(best_mean_reward)

        if len(self.test_rewards) >= 10:
            global_step = frame_idx * self.args.num_envs
            self.writer.add_scalar(
                'data/reward',
                self._recent_mean(self.test_rewards, 10), global_step)
            self.writer.add_scalar(
                'data/ppo_loss',
                self._recent_mean(self.loss, 10), global_step)
            self.writer.add_scalar(
                'data/nb_actions_outside_range',
                self._recent_mean(self.action_bang_bang, 10), global_step)
        return best_mean_reward

    @staticmethod
    def _recent_mean(values, window):
        """Return the mean of the last ``window`` entries of ``values``.

        Returns ``nan`` for an empty sequence instead of triggering NumPy's
        "mean of empty slice" warning.
        """
        recent = values[-window:]
        if len(recent) == 0:
            return float("nan")
        return float(np.mean(recent))

    def make_env(self):
        """Return a zero-argument factory as expected by ``SubprocVecEnv``."""

        # Private trunk function for calling the SubprocVecEnv class
        def _trunk():
            # in this simple case the class TestEnv() is called
            # (see openAI for more envs)
            env = self.args.env
            env.seed(self.args.seed)
            env.set_scaling(self.args.output_scaling)
            return env

        return _trunk

    def test_env(self, vis=False):
        """Play one episode on the test environment with the current policy.

        Also appends the fraction of steps whose scaled action lies outside
        ``[-0.5, 0.5]`` to ``self.action_bang_bang``.

        Args:
            vis: Render the environment after reset and after every step.

        Returns:
            The summed reward of the episode.
        """
        state = self.env_test.reset()
        if vis:
            self.env_test.render()
        done = False
        total_reward = 0
        action_bang_bang = 0
        step = 0
        while not done:
            step += 1
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            dist, _ = self.actor_critic(state)
            action = dist.sample().cpu().numpy()[0]
            force = action * self.args.output_scaling
            next_state, reward, done, _ = self.env_test.step(action)
            if force > 0.5 or force < -0.5:
                action_bang_bang += 1
            state = next_state
            if vis:
                self.env_test.render()
            total_reward += reward
        self.action_bang_bang.append(action_bang_bang / step)
        return total_reward

    # Plain functions except that one can call them from an instance or the class
    @staticmethod
    def calc_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
        """Compute returns via generalized advantage estimation.

        Args:
            next_value: Value estimate of the state after the last step.
            rewards: Per-step rewards.
            masks: Per-step masks, 0 where the episode ended, else 1.
            values: Per-step value estimates (a list).
            gamma: Discount factor.
            tau: GAE smoothing factor.

        Returns:
            A list with one return (advantage + value) per step, in step
            order.
        """
        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = (rewards[step]
                     + gamma * values[step + 1] * masks[step]
                     - values[step])
            gae = delta + gamma * tau * masks[step] * gae
            returns.append(gae + values[step])
        # Built back-to-front; a single reverse avoids repeated insert(0, ...).
        returns.reverse()
        return returns

    @staticmethod
    def ppo_iter(mini_batch_size, states, actions, log_probs, returns,
                 advantage):
        """Yield random mini-batches (sampled with replacement).

        Yields ``batch_size // mini_batch_size`` tuples
        ``(states, actions, log_probs, returns, advantage)``.
        """
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield (states[rand_ids, :], actions[rand_ids, :],
                   log_probs[rand_ids, :], returns[rand_ids, :],
                   advantage[rand_ids, :])

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions,
                   log_probs, returns, advantages, clip_param=0.2):
        """Optimize the clipped surrogate loss on the collected rollout.

        Args:
            ppo_epochs: Number of passes over the rollout.
            mini_batch_size: Number of samples per gradient step.
            states, actions, log_probs, returns, advantages: Rollout tensors
                as concatenated in ``train``.
            clip_param: Clipping range of the probability ratio.
        """
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(
                    mini_batch_size, states, actions, log_probs, returns,
                    advantages):
                dist, value = self.actor_critic(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param,
                                    1.0 + clip_param) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()
                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy
                self.loss.append(loss.item())

                # Important step: clear gradients left from the last step
                self.optimizer.zero_grad()
                loss.backward()
                if self.args.grad_norm is not None:
                    nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                             self.args.grad_norm)
                self.optimizer.step()

    def save_network(self, reward):
        """Pickle the network's ``state_dict`` into the output directory."""
        network_path = self.output_path + "/network" + str(reward)
        with open(network_path, "wb") as network_file:
            pickle.dump(self.actor_critic.state_dict(), network_file)

    def load_network(self, path):
        """Load a ``state_dict`` written by ``save_network`` from ``path``."""
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(path, "rb") as network_file:
            network_new = pickle.load(network_file)
        self.actor_critic.load_state_dict(network_new)

    def random_seed(self):
        """Seed torch, ``random`` and NumPy with ``args.seed``."""
        torch.manual_seed(self.args.seed)
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)

    def scheduler(self):
        """Decrease the learning rate of every parameter group by ``self.delta``.

        NOTE(review): ``self.delta`` is not assigned anywhere in this class,
        so it has to be set before this method is used.
        """
        for g in self.optimizer.param_groups:
            lr = g["lr"]
            if self.args.lr_end > lr:
                lr = self.args.lr_end
            else:
                lr -= self.delta
            self.lr.append(lr)
            g["lr"] = lr

    def info(self):
        """Log the configuration, also to a file when an output dir exists."""
        # In play mode no output directory is created, so there is nowhere
        # to put the log file.
        if self.output_path is not None:
            fhandler = logging.FileHandler(
                filename=self.output_path + '/mylog.log', mode='a')
            logger.addHandler(fhandler)
        logger.info("--- INFO ---")
        logger.info("args: {}".format(self.args))
# NOTE(review): ``argparse`` is used here as the parser object, which hides
# the standard-library module of the same name; it is bound outside this
# section, so the name is kept.
argparse.add_argument('--update_intervals', type=int)
argparse.add_argument('--gifs_save_interval', type=int)
argparse.add_argument('--gradient_clipping', type=float)
argparse.add_argument('--render', action='store_true')
argparse.add_argument('--critic_coefficient', type=float)
args = argparse.parse_args()

# One environment per worker thread.
print('Creating {} environments for parallel processing'.format(
    args.threads))
args.environments = [
    gym.make(args.environment) for _ in range(args.threads)
]

args.optimizer = tf.keras.optimizers.SGD(args.learning_rate)
args.actor_critic = ActorCritic(
    args.environments[0].action_space.n,
    input_shape=args.environments[0].observation_space.sample().shape)
args.actor_critic.set_threads(args.threads)

# Call the model once on a sample observation before (optionally) loading
# weights, then reset the state of thread 0 touched by that call.
sample_input = process_screen(
    args.environments[0].observation_space.sample())
args.actor_critic(sample_input, 0)
args.actor_critic.reset_thread_states(0)

# Identity comparison is the idiomatic (and override-proof) None check.
if args.checkpoint_path is not None:
    args.actor_critic.load_weights(args.checkpoint_path)

args.summary_writer = tf.summary.create_file_writer(args.log_dir)
run_training_procedure(args)
def __init__(self, opt, q_batch):
    """Build the learner's network and its optimizer.

    Args:
        opt: Options object; passed to ``ActorCritic`` and read for ``lr``.
        q_batch: Object stored on ``self.q_batch`` for later use.
    """
    self.opt, self.q_batch = opt, q_batch

    # Network on the module-level ``device``.
    net = ActorCritic(opt).to(device)
    self.network = net

    # Adam over all network parameters, learning rate taken from the options.
    self.optimizer = Adam(net.parameters(), lr=opt.lr)

    # Called last, once the optimizer holds the parameters.
    net.share_memory()
class Learner(object):
    """Consumes batched traces from a queue and trains the shared network.

    Each iteration of ``learning`` pulls one batch from ``q_batch``, replays
    it through the network and performs one optimizer step on an
    importance-weighted policy/value loss.
    """

    def __init__(self, opt, q_batch):
        """Store the options and queue, build the network and its optimizer."""
        self.opt = opt
        self.q_batch = q_batch
        self.network = ActorCritic(opt).to(device)
        self.optimizer = Adam(self.network.parameters(), lr=opt.lr)
        self.network.share_memory()

    def learning(self):
        """Run the training loop; never returns (``while True``)."""
        torch.manual_seed(self.opt.seed)
        # Upper bounds used to truncate the importance-sampling ratio below.
        coef_hat = torch.Tensor([[self.opt.coef_hat]]).to(device)
        rho_hat = torch.Tensor([[self.opt.rho_hat]]).to(device)
        while True:
            # batch-trace (shapes as noted by the original author)
            # s[batch, n_step+1, 3, width, height]
            # a[batch, n_step, a_space]
            # rew[batch, n_step]
            # a_prob[batch, n_step, a_space]
            s, a, rew, prob = self.q_batch.get(block=True)
            ###########################
            # variables we need later #
            ###########################
            v, coef, rho, entropies, log_prob = [], [], [], [], []
            # Recurrent state starts from zeros for every batch.
            cx = torch.zeros(self.opt.batch_size, 256).to(device)
            hx = torch.zeros(self.opt.batch_size, 256).to(device)
            for step in range(s.size(1)):
                # value[batch]
                # logit[batch, 12]
                value, logit, (hx, cx) = self.network((s[:, step, ...], (hx, cx)))
                v.append(value)
                if step >= a.size(1):  # noted that s[, n_step+1, ...] but a[, n_step,...]
                    break  # loop for n_step+1 because v in n_step+1 is needed.

                # π/μ[batch]
                # Per-dimension probability of the taken (binary) action under
                # the current network (detached) and under the recorded
                # behaviour probabilities.
                # TODO: cumprod might produce runtime problem
                logit_a = a[:, step, :] * logit.detach() + (
                    1 - a[:, step, :]) * (1 - logit.detach())
                prob_a = a[:, step, :] * prob[:, step, :] + (
                    1 - a[:, step, :]) * (1 - prob[:, step, :])
                # Last column of the cumulative product along dim 1, i.e. the
                # product of the per-dimension ratios.
                is_rate = torch.cumprod(logit_a / (prob_a + 1e-6), dim=1)[:, -1]
                coef.append(torch.min(coef_hat, is_rate))
                rho.append(torch.min(rho_hat, is_rate))

                # enpy_aspace[batch, 12]
                # calculating the entropy[batch, 1]
                # more specifically there are [a_space] entropy for each batch, sum over them here.
                # noted that ~do not~ use detach here
                # NOTE(review): torch.log is applied without clamping, so a
                # logit of exactly 0 or 1 would produce inf/nan — confirm the
                # network output range.
                enpy_aspace = -torch.log(logit) * logit - torch.log(
                    1 - logit) * (1 - logit)
                enpy = (enpy_aspace).sum(dim=1, keepdim=True)
                entropies.append(enpy)

                # calculating the prob that the action is taken by target policy
                # and the prob_pi_a[batch, 12] and log_prob[batch, 1] of this action
                # noted that ~do not~ use detach here
                prob_pi_a = (a[:, step, :] * logit) + (1 - a[:, step, :]) * (1 - logit)
                log_prob_pi_a = torch.log(prob_pi_a).sum(dim=1, keepdim=True)
                log_prob.append(log_prob_pi_a)
                # prob_pi_a = torch.cumprod(prob_pi_a, dim=1)[:, -1:]
                # log_prob_pi_a = torch.log(prob_pi_a)

            ####################
            # calculating loss #
            ####################
            policy_loss = 0
            value_loss = 0
            # gae = torch.zeros(self.opt.batch_size, 1)
            # Walk the trace backwards so each step can reuse the quantity
            # accumulated from the later steps.
            for rev_step in reversed(range(s.size(1) - 1)):
                # compute v_(s+1)[batch] for policy gradient
                # (uses value_loss as accumulated from the later steps)
                fix_vp = rew[:, rev_step] + self.opt.gamma * (
                    v[rev_step + 1] + value_loss) - v[rev_step]

                # value_loss[batch]
                td = rew[:, rev_step] + self.opt.gamma * v[rev_step + 1] - v[rev_step]
                value_loss = self.opt.gamma * coef[
                    rev_step] * value_loss + rho[rev_step] * td

                # policy_loss = policy_loss - log_probs[i] * Variable(gae)
                # the td must be detach from network-v

                # # dalta_t[batch]
                # delta_t = rew[:, rev_step] + self.opt.gamma * v[rev_step + 1] - v[rev_step]
                # gae = gae * self.opt.gamma + delta_t.detach()

                policy_loss = policy_loss \
                    - rho[rev_step] * log_prob[rev_step] * fix_vp.detach() \
                    - self.opt.entropy_coef * entropies[rev_step]

            self.optimizer.zero_grad()
            policy_loss = policy_loss.sum()
            # NOTE(review): value_loss is summed as a signed quantity (it is
            # not squared anywhere above) — confirm this is intended.
            value_loss = value_loss.sum()
            loss = policy_loss + self.opt.value_loss_coef * value_loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.network.parameters(),
                                           self.opt.max_grad_norm)
            print("v_loss {:.3f} p_loss {:.3f}".format(value_loss.item(),
                                                       policy_loss.item()))
            self.optimizer.step()
class Actor(object):
    """Plays the game with a copy of the learner's network and emits traces.

    ``performing`` loops forever: it refreshes the behaviour network from the
    learner, collects up to ``opt.n_step`` transitions, zero-pads them to a
    fixed length and puts them on ``q_trace``.
    """

    def __init__(self, opt, q_trace, learner):
        """Store the collaborators and build the behaviour network."""
        self.opt = opt
        self.q_trace = q_trace
        self.learner = learner

        # Game
        self.env = None
        # s_channel = self.env.observation_space.shape[0]
        # a_space = self.env.action_space

        # Network
        self.behaviour = ActorCritic(opt).to(device)

    def performing(self, rank):
        """Run the acting loop; never returns (``while True``).

        Args:
            rank: Offset added to ``opt.seed`` when seeding the environment.
        """
        torch.manual_seed(self.opt.seed)
        # Each thread initializes its own environment
        self.env = retro.make(game=self.opt.env)
        self.env.seed(self.opt.seed + rank)

        s = self.env.reset()
        s = transform(s).unsqueeze(dim=0).to(device)
        episode_length = 0
        r_sum = 0.
        # Starts as True so the recurrent state is zero-initialized on the
        # first pass.
        done = True
        while True:
            # apply
            # print(type(self.learner.network.state_dict()))
            # Copy the learner's current weights into the behaviour network.
            self.behaviour.load_state_dict(self.learner.network.state_dict())
            # LSTM
            if done:
                cx = torch.zeros(1, 256).to(device)
                hx = torch.zeros(1, 256).to(device)
            else:
                # Carry the state over, detached from the previous trace.
                cx = cx.detach()
                hx = hx.detach()

            trace_s, trace_a, trace_rew, trace_aprob = [], [], [], []
            # collect n-step
            for n in range(self.opt.n_step):
                episode_length += 1
                # add to trace - 0
                trace_s.append(s)
                value, logit, (hx, cx) = self.behaviour((s, (hx, cx)))
                logit = logit.detach()
                # Sample each action component independently with the
                # network output as its probability.
                action = torch.bernoulli(logit)

                s, rew, done, info = self.env.step(
                    action.squeeze().to("cpu").numpy().astype(np.int8))
                r_sum += rew
                s = transform(s).unsqueeze(dim=0).to(device)
                rew = torch.Tensor([rew]).to(device)
                # Also end the episode once the length limit is reached.
                done = done or episode_length >= self.opt.max_episode_length

                # add to trace - 1
                trace_a.append(action)
                trace_rew.append(rew)
                trace_aprob.append(logit)
                if done:
                    print("over, reward {}".format(r_sum))
                    r_sum = 0
                    episode_length = 0
                    # game over punishment
                    trace_rew[-1] = torch.Tensor([-200.]).to(device)
                    break
            # add to trace - 2
            # The state after the last action, so states hold one more entry
            # than actions.
            trace_s.append(s)
            # stack n-step (shapes as noted by the original author)
            # s[n_step+1, 3, width, height]
            # a[n_step, a_space]
            # rew[n_step]
            # a_prob[n_step]
            # Each trace is copied into a zero tensor of full length, so
            # traces cut short by ``done`` are zero-padded at the end.
            trace_s = torch.cat(tuple(trace_s), dim=0)
            zeros = torch.zeros((self.opt.n_step + 1, ) +
                                trace_s.size()[1:]).to(device)  # expand
            zeros[:trace_s.size(0)] += trace_s
            trace_s = zeros

            trace_a = torch.cat(tuple(trace_a), dim=0)
            zeros = torch.zeros((self.opt.n_step, ) + trace_a.size()[1:]).to(
                device)  # expand
            zeros[:trace_a.size(0)] += trace_a
            trace_a = zeros

            trace_rew = torch.cat(tuple(trace_rew), dim=0)
            zeros = torch.zeros(self.opt.n_step).to(device)  # expand
            zeros[:trace_rew.size(0)] += trace_rew
            trace_rew = zeros

            trace_aprob = torch.cat(tuple(trace_aprob), dim=0)
            zeros = torch.zeros((self.opt.n_step, ) +
                                trace_aprob.size()[1:]).to(device)  # expand
            zeros[:trace_aprob.size(0)] += trace_aprob
            trace_aprob = zeros

            # submit trace to queue
            self.q_trace.put((trace_s.to("cpu"), trace_a.to("cpu"),
                              trace_rew.to("cpu"), trace_aprob.to("cpu")),
                             block=True)

            if done:
                s = self.env.reset()
                s = transform(s).unsqueeze(dim=0).to(device)
#!/usr/bin/python3
from pendulum import Pendulum
from network import ActorCritic
import numpy as np
import pickle
import os.path
import random

actorCritic = ActorCritic(Pendulum.state_size, Pendulum.action_size)

# Resume from previously recorded experiences when the file exists.
experiences = []
if os.path.exists('experiences.p'):
    # NOTE: unpickling executes arbitrary code; only load files this
    # script wrote itself. The context manager closes the handle promptly.
    with open("experiences.p", "rb") as experiences_file:
        experiences = pickle.load(experiences_file)
print('experiences ', len(experiences))

pendulum = Pendulum(Pendulum.random_theta())
# NOTE(review): ``round`` shadows the builtin; the name is kept because it
# may be used by code following this section.
round = 0
iteration = 0
action0 = False
while round < 27:
    state0 = pendulum.state()
    actions = actorCritic.run_actor([state0])
    # Explore with probability 0.5, otherwise take the highest-scored action.
    if random.random() < 0.5:
        action1 = np.random.choice(Pendulum.action_size, 1)[0]
    else:
        action1 = np.argmax(actions)
def _func():
    """Run one training step on a fresh batch and return its statistics.

    This is a closure: it uses ``self`` from the enclosing scope.

    While the step runs, ``self.val_counter`` is incremented; it is always
    decremented again on exit, including when an exception is raised or the
    mode is unknown.

    Returns:
        A 7-tuple ``(pred_loss, prediction_correct, reading_percentage,
        actor_loss, critic_loss, entropy_loss, final_reward)``. In mode 1
        the entries that are not computed are one-element float32 zero
        arrays. ``None`` when ``self.mode`` is neither 1 nor 2.
    """
    # lock for stopping training during validation
    if self.val_mode:
        # main thread has locked the following lock, released when val is
        # done: acquiring blocks until then, and the lock is handed straight
        # back.
        self.val_lock.acquire()
        self.val_lock.release()

    # signal we enter area that modifies the network
    with self.val_counter.get_lock():
        self.val_counter.value += 1

    try:
        # get batch
        (this_x, this_y, this_comma, this_punctuation, this_addMax, mask,
         seqs, q_a_words) = self.train_generator()

        if self.mode == 1:
            pred_loss, pred, entropy_loss = self.sampler.consume_sample_full_read(
                this_x, seqs, this_y, self.scale_pred, q_a_words)
            prediction_correct, reward_pred = self.prediction_correct(
                pred, this_y)
            return (pred_loss, prediction_correct,
                    np.zeros(1, dtype=np.float32),
                    np.zeros(1, dtype=np.float32),
                    np.zeros(1, dtype=np.float32),
                    entropy_loss,
                    np.zeros(1, dtype=np.float32))

        elif self.mode == 2:
            batch_size = this_x.shape[1]

            # periodically synchronize the sampler
            self.i += 1
            if self.i % self.update_after == 0:
                with self.sync_lock:
                    self.sampler.sync()

            (read_words, action_agent_1, value_agent_1, action_agent_2,
             value_agent_2, predictions, is_not_done, probs_agent_1,
             probs_agent_2) = self.sampler.get_sample(
                 this_x, seqs, this_comma, this_punctuation, q_a_words)

            # construct new input to the training network, consisting only
            # of the read words. The appended all-False row guarantees that
            # argmin finds a False entry in every column.
            is_not_done = np.append(is_not_done,
                                    np.zeros((1, batch_size), dtype=bool),
                                    axis=0)
            time_length_reduced_input = np.argmin(is_not_done, axis=0)
            max_len = np.max(time_length_reduced_input)

            # make reduced_x 1 longer to avoid out of bounds errors when
            # looping in lstm
            reduced_x = np.zeros(shape=(max_len + 1, batch_size), dtype=np.int32)
            reduced_action_1 = np.zeros(shape=(max_len, batch_size), dtype=np.int32)
            reduced_value_1 = np.zeros(shape=(max_len, batch_size), dtype=np.int32)
            reduced_action_2 = np.zeros(shape=(max_len, batch_size), dtype=np.int32)
            reduced_value_2 = np.zeros(shape=(max_len, batch_size), dtype=np.int32)
            reduced_probs_1 = np.zeros(shape=(max_len, batch_size), dtype=np.float32)
            reduced_probs_2 = np.zeros(shape=(max_len, batch_size), dtype=np.float32)
            number_skips = np.zeros(shape=(batch_size), dtype=int)

            for i, length in enumerate(time_length_reduced_input):
                reduced_x[:length, i] = this_x[read_words[:length, i], i]
                reduced_action_1[:length, i] = action_agent_1[:length, i]
                reduced_value_1[:length, i] = value_agent_1[:length, i]
                reduced_action_2[:length, i] = action_agent_2[:length, i]
                # agent 2's values (the original copied agent 1's here)
                reduced_value_2[:length, i] = value_agent_2[:length, i]
                reduced_probs_1[:length, i] = probs_agent_1[:length, i]
                reduced_probs_2[:length, i] = probs_agent_2[:length, i]
                number_skips[i] = np.sum(
                    action_agent_1[:length, i] == ActorCritic.agent_1_skip())

            # compute the reward
            prediction_correct, reward_pred = self.prediction_correct(
                predictions, this_y)
            (rolling_reward_agent_1, rolling_reward_agent_2, reward_at_end,
             is_not_done) = self.rolling_reward(
                 reduced_action_1, reduced_action_2,
                 time_length_reduced_input, seqs, prediction_correct)
            final_reward = reward_at_end + reward_pred

            # compute actual advantage
            sampled_advantage_1 = final_reward - rolling_reward_agent_1
            sampled_advantage_2 = final_reward - rolling_reward_agent_2

            pred_loss, actor_loss, critic_loss, entropy_loss = self.sampler.consume_sample(
                reduced_x,
                time_length_reduced_input,
                sampled_advantage_1,
                reduced_action_1,
                sampled_advantage_2,
                reduced_action_2,
                is_not_done,
                this_y,
                final_reward,
                self.scale_pred,
                self.scale_critic,
                self.scale_actor,
                q_a_words,
                reduced_probs_1,
                reduced_probs_2,
                embedding_train=self.embed_partial)

            # general logging of behavior
            self.runned_batches += 1
            reading_percentage = (time_length_reduced_input) / seqs

            return (pred_loss, prediction_correct,
                    reading_percentage.astype(np.float32), actor_loss,
                    critic_loss, entropy_loss, final_reward)
    finally:
        # signal we leave area that modifies the network
        with self.val_counter.get_lock():
            self.val_counter.value -= 1
batch_size = 50 is_Q_A = False sync_lock = Lock() val_lock = Lock() val_counter = Value("i", 0) NUMBER_THREADS = 3 tf.reset_default_graph() dg = DataGenerator( dataset_name, folder_data, folder_result, batch_size, is_Q_A, w2v=None ) #, w2v=None #no embedding, as they are slow to use when just debugging ac_consumer = ActorCritic(dg.batch_size, dg.vocab_size, dg.number_targets, scope_name="consumer", device="/gpu:0") samplers = [ ActorCritic(dg.batch_size, dg.vocab_size, dg.number_targets, consumer=ac_consumer, scope_name="sampler_" + str(i), device="/cpu:0") for i in range(NUMBER_THREADS) ] init_op = [ tf.global_variables_initializer(), tf.local_variables_initializer() ] sess = tf.Session(config=tf.ConfigProto(