def run(args):
    """Train SAC on a Gym environment; yields (step, test_reward, *losses) at each test interval."""
    env = gym.make(args.env)
    device = torch.device(args.device)

    # 1. Seed every RNG source for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    random.seed(args.seed)
    env.seed(args.seed)

    # 2. Build the online and target actor-critic networks and sync them.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    hidden_sizes = (256, 256)
    ac = ActorCritic(state_size, action_size, hidden_sizes).to(device)
    ac_target = ActorCritic(state_size, action_size, hidden_sizes).to(device)
    hard_update(ac, ac_target)

    # env_sampler = EnvSampler(env, max_episode_step=4000, capacity=1e6)
    env_sampler = EnvSampler2(env, gamma=args.gamma1, capacity=1e6)
    alg = SAC(ac, ac_target, gamma=args.gamma2, alpha=0.2,
              q_lr=1e-3, pi_lr=1e-3, target_lr=5e-3, device=device)

    def get_action(state):
        # Stochastic action from the target policy (used while collecting data).
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        return ac_target.get_action(state)

    def get_mean_action(state):
        # Deterministic (mean) action, used for evaluation rollouts.
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        return ac_target.get_action(state, deterministic=True)

    # Warm-up: fill the replay buffer with random-policy transitions.
    start_time = time()
    for _ in range(args.start_steps):
        env_sampler.addSample()
    print("Warmup uses {}s.".format(time() - start_time))

    # Main loop: interleave environment sampling with batched SAC updates.
    for step in range(1, args.total_steps + 1):
        env_sampler.addSample(get_action)
        if step % args.update_every == 0:
            for _ in range(args.update_every):
                batch = env_sampler.sample(args.batch_size)
                losses = alg.update(*batch)
        if step % args.test_every == 0:
            test_reward = env_sampler.test(get_mean_action)
            yield (step, test_reward, *losses)

    # Persist only the policy network's weights.
    torch.save(ac.pi.state_dict(), './env_{}_pi_net.pth.tar'.format(args.env))
def __init__(self, nb_actions, learning_rate, gamma, hidden_size, model_input_size, entropy_coeff_start, entropy_coeff_end, entropy_coeff_anneal, continuous):
    """Set up the agent: hyper-parameters, actor-critic network, optimizer, and ICM module."""
    # Prefer GPU when one is available.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.num_actions = nb_actions
    self.gamma = gamma
    self.continuous = continuous
    self.learning_rate = learning_rate
    # Entropy coefficient schedule parameters (presumably annealed from
    # `start` to `end` over `anneal` steps elsewhere — TODO confirm).
    self.entropy_coefficient_start = entropy_coeff_start
    self.entropy_coefficient_end = entropy_coeff_end
    self.entropy_coefficient_anneal = entropy_coeff_anneal
    self.step_no = 0
    # Select the network variant matching the action space.
    if self.continuous:
        self.model = ActorCriticContinuous(hidden_size=hidden_size,
                                           inputs=model_input_size,
                                           outputs=nb_actions).to(self.device)
    else:
        self.model = ActorCritic(hidden_size=hidden_size,
                                 inputs=model_input_size,
                                 outputs=nb_actions).to(self.device)
    self.hidden_size = hidden_size
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
    self.loss_function = torch.nn.MSELoss()
    self.memory = []
    # Intrinsic Curiosity Module, kept in training mode.
    self.ICM = ICM(model_input_size, nb_actions)
    self.ICM.train()
def global_test(global_model, device, args, model_type, delay=0.03):
    """Play the environment greedily with a copy of the global model until the
    level is cleared (``info['flag_get']``).

    Fixes vs. original: the environment was reset twice at start-up (the first
    state was discarded) and there was a no-op ``device = device`` assignment;
    both removed.
    """
    world = args.world
    stage = args.stage
    env = create_env(world, stage)

    # Single reset; move the initial observation to the compute device.
    state = env.reset().to(device, dtype=torch.float)
    state = state.view(1, 1, 80, 80)
    done = True

    if model_type == "LSTM":
        model = ActorCritic_LSTM().to(device)
    else:
        model = ActorCritic().to(device)
    model.eval()
    model.load_state_dict(global_model.state_dict())

    while True:
        # Reset the recurrent state on episode boundaries, otherwise carry it
        # over (detached so no graph is kept).
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        h_0 = h_0.to(device)
        c_0 = c_0.to(device)

        env.render()
        p, _, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(p, dim=1)
        action = torch.argmax(policy)  # greedy action for evaluation
        next_state, _, done, info = env.step(action.item())
        next_state = next_state.to(device, dtype=torch.float)
        next_state = next_state.view(1, 1, 80, 80)
        state = next_state

        if done:
            # Stop once the flag is reached; otherwise retry with the latest
            # global weights.
            if info['flag_get']:
                break
            state = env.reset()
            state = state.to(device)
            state = state.view(1, 1, 80, 80)
            model.load_state_dict(global_model.state_dict())

        time.sleep(delay)  # throttle rendering

    print('Success clear {}-{}'.format(world, stage))
def policy_action(params: flax.core.frozen_dict.FrozenDict, module: models.ActorCritic, state: onp.ndarray):
    """Forward pass of the network.

    Args:
      params: the parameters of the actor-critic model
      module: the actor-critic model
      state: the input for the forward pass

    Returns:
      out: a tuple (log_probabilities, values)
    """
    # Delegate directly to the Flax module's apply().
    return module.apply({'params': params}, state)
# Board geometry: fit a board_size x board_size grid centred in the window,
# leaving `offset` pixels of margin on each axis.
board_size = 40
offset = 20
width = window.width - offset
height = window.height - offset
board_unit = min(width // board_size, height // board_size)
x1_board = window.width // 2 - (board_size // 2 + 1) * board_unit
x2_board = x1_board + (board_size + 1) * board_unit
y1_board = window.height // 2 - (board_size // 2 + 1) * board_unit
y2_board = y1_board + (board_size + 1) * board_unit
print(x1_board, x2_board, y1_board, y2_board)

# Environment and trained policy.
env = TrainEnvSingle()
game = env.game

model = ActorCritic()
model.load_state_dict(torch.load("weights.pt"))
model = model.eval()

# Initial forward pass to seed the displayed action values.
state, invalid = env.reset()
dist, value = model(state, invalid)
q_values = dist.probs.tolist()[0]


def take_action(dt):
    # Scheduled callback stub (presumably driven by a clock elsewhere — TODO confirm).
    pass


def reload_model(dt):
    """Hot-reload the latest trained weights from disk."""
    global model
    model.load_state_dict(torch.load("weights.pt"))
    print("Reloaded model")
def main(args):
    """Configure logging and CUDA, build the shared A3C model/optimizer, and
    spawn the training and test worker processes."""
    print(f" Session ID: {args.uuid}")

    # logging
    log_dir = f'logs/{args.env_name}/{args.model_id}/{args.uuid}/'
    args_logger = setup_logger('args', log_dir, f'args.log')
    env_logger = setup_logger('env', log_dir, f'env.log')

    if args.debug:
        debug.packages()

    os.environ['OMP_NUM_THREADS'] = "1"
    if torch.cuda.is_available():
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        devices = ",".join([str(i) for i in range(torch.cuda.device_count())])
        os.environ["CUDA_VISIBLE_DEVICES"] = devices

    args_logger.info(vars(args))
    env_logger.info(vars(os.environ))

    env = create_atari_environment(args.env_name)
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space.n)
    if torch.cuda.is_available():
        shared_model = shared_model.cuda()
    shared_model.share_memory()

    optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)
    optimizer.share_memory()

    if args.load_model:  # TODO Load model before initializing optimizer
        checkpoint_file = f"{args.env_name}/{args.model_id}_{args.algorithm}_params.tar"
        checkpoint = restore_checkpoint(checkpoint_file)
        assert args.env_name == checkpoint['env'], \
            "Checkpoint is for different environment"
        args.model_id = checkpoint['id']
        args.start_step = checkpoint['step']
        print("Loading model from checkpoint...")
        print(f"Environment: {args.env_name}")
        print(f" Agent: {args.model_id}")
        print(f" Start: Step {args.start_step}")
        shared_model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        print(f"Environment: {args.env_name}")
        print(f" Agent: {args.model_id}")

    torch.manual_seed(args.seed)

    print(
        FontColor.BLUE + \
        f"CPUs: {mp.cpu_count(): 3d} | " + \
        f"GPUs: {None if not torch.cuda.is_available() else torch.cuda.device_count()}" + \
        FontColor.END
    )

    processes = []
    counter = mp.Value('i', 0)
    lock = mp.Lock()

    # Queue training processes
    num_processes = args.num_processes
    no_sample = args.non_sample  # count of non-sampling processes
    if args.num_processes > 1:
        # Reserve one process slot for the test worker.
        num_processes = args.num_processes - 1
    samplers = num_processes - no_sample

    for rank in range(0, num_processes):
        device = 'cpu'
        if torch.cuda.is_available():
            device = 0  # TODO: Need to move to distributed to handle multigpu
        if rank < samplers:
            # random (sampling) action worker
            p = mp.Process(
                target=train,
                args=(rank, args, shared_model, counter, lock, optimizer, device),
            )
        else:
            # best (greedy) action worker — select_sample=False
            p = mp.Process(
                target=train,
                args=(rank, args, shared_model, counter, lock, optimizer, device, False),
            )
        p.start()
        time.sleep(1.)
        processes.append(p)

    # Queue test process
    p = mp.Process(target=test,
                   args=(args.num_processes, args, shared_model, counter, 0))
    p.start()
    processes.append(p)

    for p in processes:
        p.join()
def train(rank, args, shared_model, opt_ac, can_save, shared_obs_stats):
    """Worker loop for the RunEnv task: collect batches of episodes with the
    current shared actor-critic, run an update, and (rank 0) log / checkpoint."""
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = [1] * 48

    # Only the saving worker may render.
    if args.render and can_save:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)

    episode_lengths = []
    PATH_TO_MODEL = '../models/' + str(args.bh)
    ac_net = ActorCritic(num_inputs, num_actions)
    start_time = time.time()

    for i_episode in count(1):
        memory = Memory()
        # Pull the latest shared weights before collecting data.
        ac_net.load_state_dict(shared_model.state_dict())
        ac_net.zero_grad()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            state = env.reset(difficulty=0)
            last_state = process_observation(state)
            state = process_observation(state)
            last_state, state = transform_observation(last_state, state)
            state = numpy.array(state)

            # Normalize with the shared running observation statistics.
            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()

            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print('ERROR')
                    raise RuntimeError('action NaN problem')

                # Duplicate the 9-dim action (presumably one per leg — TODO confirm).
                BB = numpy.append(action, action)
                reward = 0
                if args.skip:
                    # Frame skip: repeat the action, accumulating reward.
                    _, A, _, _ = env.step(BB)
                    reward += A
                    _, A, _, _ = env.step(BB)
                    reward += A
                next_state, A, done, _ = env.step(BB)
                reward += A
                next_state = process_observation(next_state)
                last_state, next_state = transform_observation(
                    last_state, next_state)
                next_state = numpy.array(next_state)
                reward_sum += reward

                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()

                mask = 1
                if done:
                    mask = 0
                memory.push(state, np.array([action]), mask, next_state, reward)
                if done:
                    break
                state = next_state

            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()
        update_params_actor_critic(batch, args, shared_model, ac_net, opt_ac)

        epoch = i_episode
        if (i_episode % args.log_interval == 0) and (rank == 0):
            print('TrainEpisode {}\tLast reward: {}\tAverage reward {:.2f}'.
                  format(i_episode, reward_sum, reward_batch))
            # NOTE(review): checkpointing nested under the rank-0 log branch —
            # confirm against the original file's indentation.
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': ac_net.state_dict(),
                        'optimizer': opt_ac,
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, 'best')
            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': ac_net.state_dict(),
                        'optimizer': opt_ac,
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, epoch)
from functools import partial import torch from torch import autograd, optim from torch.distributions import Independent, Normal from torch.distributions.kl import kl_divergence from torch.nn.utils import parameters_to_vector, vector_to_parameters from tqdm import tqdm from env import Env from hyperparams import BACKTRACK_COEFF, BACKTRACK_ITERS, ON_POLICY_BATCH_SIZE as BATCH_SIZE, CONJUGATE_GRADIENT_ITERS, DAMPING_COEFF, DISCOUNT, HIDDEN_SIZE, KL_LIMIT, LEARNING_RATE, MAX_STEPS, TRACE_DECAY from models import ActorCritic from utils import plot env = Env() agent = ActorCritic(env.observation_space.shape[0], env.action_space.shape[0], HIDDEN_SIZE) critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE) def hessian_vector_product(d_kl, x): g = parameters_to_vector( autograd.grad(d_kl, agent.actor.parameters(), create_graph=True)) return parameters_to_vector( autograd.grad((g * x.detach()).sum(), agent.actor.parameters(), retain_graph=True)) + DAMPING_COEFF * x def conjugate_gradient(Ax, b): x = torch.zeros_like(b) r = b - Ax(x) # Residual p = r # Conjugate vector
import torch from torch import optim from tqdm import tqdm from env import Env from hyperparams import ON_POLICY_BATCH_SIZE as BATCH_SIZE, DISCOUNT, HIDDEN_SIZE, INITIAL_POLICY_LOG_STD_DEV, LEARNING_RATE, MAX_STEPS, TRACE_DECAY, VALUE_EPOCHS from models import ActorCritic from utils import plot env = Env() agent = ActorCritic(env.observation_space.shape[0], env.action_space.shape[0], HIDDEN_SIZE, initial_policy_log_std_dev=INITIAL_POLICY_LOG_STD_DEV) actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE) critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE) state, done, total_reward, D = env.reset(), False, 0, [] pbar = tqdm(range(1, MAX_STEPS + 1), unit_scale=1, smoothing=0) for step in pbar: # Collect set of trajectories D by running policy π in the environment policy, value = agent(state) action = policy.sample() log_prob_action = policy.log_prob(action) next_state, reward, done = env.step(action) total_reward += reward D.append({ 'state': state, 'action': action, 'reward': torch.tensor([reward]), 'done': torch.tensor([done], dtype=torch.float32), 'log_prob_action': log_prob_action,
from functools import partial import torch from torch import autograd, optim from torch.distributions import Normal from torch.distributions.kl import kl_divergence from torch.nn.utils import parameters_to_vector, vector_to_parameters from tqdm import tqdm from env import Env from hyperparams import BACKTRACK_COEFF, BACKTRACK_ITERS, ON_POLICY_BATCH_SIZE as BATCH_SIZE, CONJUGATE_GRADIENT_ITERS, DAMPING_COEFF, DISCOUNT, HIDDEN_SIZE, KL_LIMIT, LEARNING_RATE, MAX_STEPS, TRACE_DECAY from models import ActorCritic from utils import plot env = Env() agent = ActorCritic(HIDDEN_SIZE) critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE) def hessian_vector_product(d_kl, x): g = parameters_to_vector( autograd.grad(d_kl, agent.actor.parameters(), create_graph=True)) return parameters_to_vector( autograd.grad((g * x.detach()).sum(), agent.actor.parameters(), retain_graph=True)) + DAMPING_COEFF * x def conjugate_gradient(Ax, b): x = torch.zeros_like(b) r = b - Ax(x) # Residual p = r # Conjugate vector r_dot_old = torch.dot(r, r)
default=5, metavar='IE', help='Imitation learning epochs') parser.add_argument('--imitation-replay-size', type=int, default=1, metavar='IRS', help='Imitation learning trajectory replay size') args = parser.parse_args() torch.manual_seed(args.seed) os.makedirs('results', exist_ok=True) # Set up environment and models env = CartPoleEnv() env.seed(args.seed) agent = ActorCritic(env.observation_space.shape[0], env.action_space.n, args.hidden_size) agent_optimiser = optim.RMSprop(agent.parameters(), lr=args.learning_rate) if args.imitation: # Set up expert trajectories dataset expert_trajectories = torch.load('expert_trajectories.pth') expert_trajectories = { k: torch.cat([trajectory[k] for trajectory in expert_trajectories], dim=0) for k in expert_trajectories[0].keys() } # Flatten expert trajectories expert_trajectories = TransitionDataset(expert_trajectories) # Set up discriminator if args.imitation in ['AIRL', 'GAIL']: if args.imitation == 'AIRL': discriminator = AIRLDiscriminator(env.observation_space.shape[0], env.action_space.n,
# create env: build the portfolio environment from historical price data and
# wrap it so observations/actions are torch tensors.
history, abbreviation = create_env_input()
env = PortfolioEnv(history, abbreviation)
#env = ch.envs.Logger(env, interval=20)
env = ch.envs.Torch(env)

# create net: online and target actor-critic networks with identical weights.
action_size = env.action_space.shape[0]
number_asset, seq_window, features_all = env.observation_space.shape
# One action per asset plus cash.
assert action_size == number_asset + 1
input_size = features_all - 1
net = ActorCritic(input_size=input_size, hidden_size=50, action_size=action_size)
net_tgt = ActorCritic(input_size=input_size, hidden_size=50, action_size=action_size)
net_tgt.eval()
print(net_tgt)
net_tgt.load_state_dict(net.state_dict())

# create replay
replay = ch.ExperienceReplay()

# create loss function
criterion_mse = nn.MSELoss()

# create optimizer
num_inputs = envs.observation_space num_outputs = envs.action_space # Hyper-parameters NB_STEP = 128 UPDATE_EPOCH = 10 MINI_BATCH_SIZE = 512 SIZES = [64] GAMMA = 0.99 LAMBDA = 0.95 EPSILON = 0.2 REWARD_THRESHOLD = 190 model = ActorCritic(num_inputs, num_outputs, SIZES) frame_idx = 0 test_rewards = [] #env_render = False state = envs.reset() early_stop = False PATH = "saved_models/model_ppo_pendulum.pt" while not early_stop: log_probs = [] values = [] states = [] actions = []
def make_env(env_num): return Snake(env_num, board_size=40, terminal_step=None) write = True save = True load = False num_steps = 200 env_num = 128 worker_num = 2 mini_batch_size = 64 ppo_epochs = 4 if write: writer = SummaryWriter() model = ActorCritic() if load: model.load_state_dict(torch.load("weights.pt")) model = model.cuda() opt = torch.optim.AdamW(model.parameters()) envs_fns = [make_env for _ in range(worker_num)] envs = SubprocWrapper(envs_fns, env_num) step = 0 while True: log_probs = [] values = [] states = [] invalids = [] rewards = []
def main(args):
    """Train a policy-gradient agent (vanilla PG or clipped PPO) on a Gym env.

    Fixes vs. original: the ``# TODO`` stubs in ``compute_loss_pi`` and
    ``compute_loss_v`` left empty ``if`` bodies (a syntax error) and never
    defined ``loss_pi`` / ``loss_v``; both losses are now implemented.
    """
    # create environment
    env = gym.make(args.env)
    env.seed(args.seed)
    obs_dim = env.observation_space.shape[0]
    if isinstance(env.action_space, Discrete):
        discrete = True
        act_dim = env.action_space.n
    else:
        discrete = False
        act_dim = env.action_space.shape[0]

    # actor critic
    ac = ActorCritic(obs_dim, act_dim, discrete).to(args.device)
    print('Number of parameters', count_vars(ac))

    # Set up experience buffer
    steps_per_epoch = int(args.steps_per_epoch)
    buf = PGBuffer(obs_dim, act_dim, discrete, steps_per_epoch, args)
    logs = defaultdict(lambda: [])
    writer = SummaryWriter(args_to_str(args))
    gif_frames = []

    # Set up function for computing policy loss
    def compute_loss_pi(batch):
        obs, act, psi, logp_old = batch['obs'], batch['act'], batch['psi'], batch['logp']
        pi, logp = ac.pi(obs, act)

        # Policy loss
        if args.loss_mode == 'vpg':
            # Vanilla policy gradient: maximize E[log pi(a|s) * psi].
            loss_pi = -(logp * psi).mean()
        elif args.loss_mode == 'ppo':
            # Clipped PPO surrogate objective.
            # NOTE(review): clip ratio arg name not visible here — confirm;
            # 0.2 is the standard default.
            clip_ratio = getattr(args, 'clip_ratio', 0.2)
            ratio = torch.exp(logp - logp_old)
            clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * psi
            loss_pi = -torch.min(ratio * psi, clip_adv).mean()
        else:
            raise Exception('Invalid loss_mode option', args.loss_mode)

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        pi_info = dict(kl=approx_kl, ent=ent)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(batch):
        obs, ret = batch['obs'], batch['ret']
        v = ac.v(obs)
        # Mean-squared error against the empirical return.
        loss_v = ((v - ret) ** 2).mean()
        return loss_v

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=args.pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=args.v_lr)

    # Set up update function
    def update():
        batch = buf.get()

        # Get loss and info values before update
        pi_l_old, pi_info_old = compute_loss_pi(batch)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(batch).item()

        # Policy learning
        for i in range(args.train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(batch)
            loss_pi.backward()
            pi_optimizer.step()

        # Value function learning
        for i in range(args.train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(batch)
            loss_v.backward()
            vf_optimizer.step()

        # Log changes from update
        kl, ent = pi_info['kl'], pi_info_old['ent']
        logs['kl'] += [kl]
        logs['ent'] += [ent]
        logs['loss_v'] += [loss_v.item()]
        logs['loss_pi'] += [loss_pi.item()]

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    ep_count = 0  # just for logging purpose, number of episodes run

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(args.epochs):
        for t in range(steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32).to(args.device))
            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)

            if ep_count % 100 == 0:
                frame = env.render(mode='rgb_array')
                # uncomment this line if you want to log to tensorboard (can be memory intensive)
                #gif_frames.append(frame)
                #gif_frames.append(PIL.Image.fromarray(frame).resize([64,64]))  # you can try this downsize version if you are resource constrained
                time.sleep(0.01)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == args.max_ep_len
            terminal = d or timeout
            epoch_ended = t == steps_per_epoch - 1

            if terminal or epoch_ended:
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32).to(args.device))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logs['ep_ret'] += [ep_ret]
                    logs['ep_len'] += [ep_len]
                    ep_count += 1
                o, ep_ret, ep_len = env.reset(), 0, 0

        # save a video to tensorboard so you can view later
        if len(gif_frames) != 0:
            vid = np.stack(gif_frames)
            vid_tensor = vid.transpose(0, 3, 1, 2)[None]
            writer.add_video('rollout', vid_tensor, epoch, fps=50)
            gif_frames = []
            writer.flush()
            print('wrote video')

        # Perform VPG update!
        update()

        if epoch % 10 == 0:
            vals = {key: np.mean(val) for key, val in logs.items()}
            for key in vals:
                writer.add_scalar(key, vals[key], epoch)
            writer.flush()
            print('Epoch', epoch, vals)
            logs = defaultdict(lambda: [])
def run(self):
    """A3C worker loop: roll out up to `Timestamp` steps, accumulate
    actor/critic/entropy losses backwards through the rollout, and push
    gradients to the global model."""
    if self.args.model_type == "LSTM":
        self.AC = ActorCritic_LSTM()
    else:
        self.AC = ActorCritic()

    env = create_env(self.world, self.stage)
    state = env.reset()
    state = state.to(self.device, dtype=torch.float)
    i_epoch = self.epoch
    done = True

    while True:
        # Reset LSTM state on episode boundaries, otherwise detach and carry.
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        h_0 = h_0.to(self.device)
        c_0 = c_0.to(self.device)

        Timestamp = 50
        for i in range(Timestamp):
            env.render()
            p, value, h_0, c_0 = self.AC(state, h_0, c_0)
            policy = F.softmax(p, dim=1)
            log_prob = F.log_softmax(p, dim=1)
            entropy = -(policy * log_prob).sum(1, keepdim=True)
            m = Categorical(policy)
            action = m.sample()
            next_state, reward, done, info = env.step(action.item())
            next_state = next_state.to(self.device, dtype=torch.float)
            self.log_probs.append(log_prob[0, action])
            self.rewards.append(reward)
            self.values.append(value)
            self.entropies.append(entropy)
            state = next_state
            if done:
                state = env.reset()
                state = state.to(self.device)
                break

        # Bootstrap R from the critic when the rollout didn't terminate.
        R = torch.zeros((1, 1), dtype=torch.float)
        if not done:
            _, R, _, _ = self.AC(state, h_0, c_0)
        R = R.to(self.device)

        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        advantage = torch.zeros((1, 1), dtype=torch.float)
        advantage = advantage.to(self.device)
        next_value = R
        # Walk the rollout backwards, accumulating GAE-style advantages.
        for log_prob, reward, value, entropy in list(
                zip(self.log_probs, self.rewards, self.values,
                    self.entropies))[::-1]:
            advantage = advantage * self.GAMMA
            advantage = advantage + reward + self.GAMMA * next_value.detach() - value.detach()
            next_value = value
            actor_loss = actor_loss + (-log_prob * advantage)
            R = R * self.GAMMA + reward
            critic_loss = critic_loss + (R - value) ** 2 / 2
            entropy_loss = entropy_loss + entropy

        total_loss = actor_loss + critic_loss - 0.01 * entropy_loss
        # Push local gradients to the global model and pull its weights back.
        push_and_pull(self.optimizer, self.AC, self.global_model, total_loss)

        if i_epoch % 10 == 0:
            print(self.name + "\ Episode %d \ Actor loss:%f \ Critic Loss:%f \ Total Loss: %f"
                  % (i_epoch, actor_loss.item(), critic_loss.item(), total_loss.item()))
        i_epoch += 1

        # Clear the rollout buffers for the next iteration.
        del self.log_probs[:]
        del self.rewards[:]
        del self.values[:]
        del self.entropies[:]

        if self.save:
            if i_epoch % 100 == 0:
                PATH = './model/{}/A3C_{}_{}.pkl'.format(self.level, self.level, self.args.model_type)
                torch.save({
                    'epoch': i_epoch,
                    'model_state_dict': self.global_model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'loss': total_loss,
                    'type': self.args.model_type,
                }, PATH)
        if i_epoch == Max_epoch:
            return
def train(rank, args, shared_model, counter, lock, optimizer=None, device='cpu', select_sample=True):
    """A3C training worker: roll out `num_steps` transitions, compute a GAE
    loss, and apply gradients to the shared model."""
    # logging
    log_dir = f'logs/{args.env_name}/{args.model_id}/{args.uuid}/'
    loss_logger = setup_logger('loss', log_dir, f'loss.log')

    # Red = sampling (stochastic) worker, green = decision (greedy) worker.
    text_color = FontColor.RED if select_sample else FontColor.GREEN
    print(
        text_color + f"Process: {rank: 3d} | {'Sampling' if select_sample else 'Decision'} | Device: {str(device).upper()}",
        FontColor.END)

    env = create_atari_environment(args.env_name)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n

    model = ActorCritic(observation_space, action_space)
    if torch.cuda.is_available():
        model = model.cuda()
        model.device = device

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    for t in count(start=args.start_step):
        if t % args.save_interval == 0 and t > 0:
            save_checkpoint(shared_model, optimizer, args, t)

        # Sync shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 512)
            hx = torch.zeros(1, 512)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        episode_length = 0
        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(-1, keepdim=True)
            entropies.append(entropy)

            reason = ''
            if select_sample:
                # Epsilon-greedy uniform exploration on top of sampling.
                rand = random.random()
                epsilon = get_epsilon(t)
                if rand < epsilon and args.greedy_eps:
                    action = torch.randint(0, action_space, (1, 1))
                    reason = 'uniform'
                else:
                    action = prob.multinomial(1)
                    reason = 'multinomial'
            else:
                # Greedy worker always takes the argmax action.
                action = prob.max(-1, keepdim=True)[1]
                reason = 'choice'

            if torch.cuda.is_available():
                action = action.cuda()
                value = value.cuda()

            log_prob = log_prob.gather(-1, action)

            state, reward, done, info = env.step(action.item())

            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 50), -50)  # h/t @ArvindSoma

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # Bootstrap the return from the critic if the rollout didn't terminate.
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.data
        values.append(R)

        loss = gae(R, rewards, values, log_probs, entropies, args)

        loss_logger.info({
            'episode': t,
            'rank': rank,
            'sampling': select_sample,
            'loss': loss.item()
        })

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
from train import train_on_env
from models import ActorCritic
from constants import *

# MLP (non-convolutional) actor-critic; CartPole observations are 4-dim.
model = ActorCritic(use_conv=False, input_size=4)
if USE_CUDA:
    model.cuda()

# Train on CartPole-v0 (presumably 5000 episodes with batch size 128 —
# TODO confirm against train_on_env's signature).
train_on_env("CartPole-v0", model, 5000, 128)
help='Clipping for PPO grad') parser.add_argument('--use-joint-pol-val', action='store_true', help='whether to use combined policy and value nets') args = parser.parse_args() env = gym.make(args.env_name) num_inputs = env.observation_space.shape[0] num_actions = env.action_space.shape[0] env.seed(args.seed) torch.manual_seed(args.seed) if args.use_joint_pol_val: ac_net = ActorCritic(num_inputs, num_actions) opt_ac = optim.Adam(ac_net.parameters(), lr=0.0003) else: policy_net = GRU(num_inputs, num_actions) old_policy_net = GRU(num_inputs, num_actions) value_net = Value(num_inputs) opt_policy = optim.Adam(policy_net.parameters(), lr=0.0003) opt_value = optim.Adam(value_net.parameters(), lr=0.0003) def create_batch_inputs(batch_states_list, batch_actions_list, batch_advantages_list, batch_targets_list): lengths = [] for states in batch_states_list: lengths.append(states.size(0))
#lr = 3e-4 #num_steps = 20 #mini_batch_size = 5 #ppo_epochs = 4 for c in range(num_classes): print("Learning Policy for class:", c) envs = [ make_env(num_features, blackbox_model, c, max_nodes, min_nodes) for i in range(num_envs) ] envs = SubprocVecEnv(envs) model = ActorCritic(num_features, embedding_size) optimizer = optim.Adam(model.parameters(), lr=lr) max_frames = 5000 frame_idx = 0 test_rewards = [] state = envs.reset() early_stop = False #Save mean rewards per episode env_0_mean_rewards = [] env_0_rewards = [] while frame_idx < max_frames and not early_stop:
action='store_true', help='force two leg together') parser.add_argument('--start-epoch', type=int, default=0, help='start-epoch') if __name__ == '__main__': args = parser.parse_args() os.environ['OMP_NUM_THREADS'] = '1' torch.manual_seed(args.seed) num_inputs = args.feature num_actions = 18 traffic_light = TrafficLight() counter = Counter() ac_net = ActorCritic(num_inputs, num_actions) opt_ac = optim.Adam(ac_net.parameters(), lr=args.lr) shared_grad_buffers = Shared_grad_buffers(ac_net) shared_obs_stats = Shared_obs_stats(num_inputs) if args.resume: print("=> loading checkpoint ") checkpoint = torch.load('../../7.87.t7') #checkpoint = torch.load('../../best.t7') args.start_epoch = checkpoint['epoch'] #best_prec1 = checkpoint['best_prec1'] ac_net.load_state_dict(checkpoint['state_dict']) opt_ac.load_state_dict(checkpoint['optimizer']) opt_ac.state = defaultdict(dict, opt_ac.state) #print(opt_ac)
def test(rank, args, shared_model, counter, device):
    """Evaluation worker: greedily play episodes with the latest shared model,
    rendering and logging results after each episode."""
    # logging
    log_dir = f'logs/{args.env_name}/{args.model_id}/{args.uuid}/'
    info_logger = setup_logger('info', log_dir, f'info.log')
    result_logger = setup_logger('results', log_dir, f'results.log')

    env = create_atari_environment(args.env_name)
    if args.record:
        # Wrap with a Monitor so playback videos are written to disk.
        if not os.path.exists(f'playback/{args.env_name}/'):
            os.makedirs(f'playback/{args.env_name}/{args.model_id}', exist_ok=True)
        env = gym.wrappers.Monitor(env, f'playback/{args.env_name}/{args.model_id}/', force=True)

    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n

    model = ActorCritic(observation_space, action_space)
    if torch.cuda.is_available():
        model.cuda()
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    episode_length = 0
    actions = deque(maxlen=4000)
    start_time = time.time()

    for episode in count():
        episode_length += 1
        # shared model sync — refresh weights at each episode start.
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 512)
            hx = torch.zeros(1, 512)
        else:
            cx = cx.data
            hx = hx.data

        with torch.no_grad():
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(-1, keepdim=True)[1]  # greedy evaluation

        state, reward, done, info = env.step(action.item())
        reward_sum += reward

        info_log = {
            'id': args.model_id,
            'algorithm': args.algorithm,
            'greedy-eps': args.greedy_eps,
            'episode': episode,
            'total_episodes': counter.value,
            'episode_length': episode_length,
            'reward': reward_sum,
            'done': done,
        }
        info_logger.info(info_log)

        print(f"{emojize(':video_game:', use_aliases=True)} | ", end='\r')
        env.render()

        actions.append(action.item())

        if done:
            t = time.time() - start_time
            print(
                f"{emojize(':video_game:', use_aliases=True)} | " + \
                f"ID: {args.model_id}, " + \
                f"Total Episodes: {counter.value}, " + \
                f"Time: {time.strftime('%H:%M:%S', time.gmtime(t)):^9s}, " + \
                f"FPS: {episode_length/t: 6.2f}, " + \
                f"Reward: {reward_sum: 10.0f}",
                end='\r',
                flush=True,
            )
            result_logger.info(info_log)
            # Reset per-episode accounting and pause before the next episode.
            reward_sum = 0
            episode_length = 0
            actions.clear()
            time.sleep(args.reset_delay)
            state = env.reset()

        state = torch.from_numpy(state)
def train(rank, args, traffic_light, counter, shared_model,
          shared_grad_buffers, shared_obs_stats, opt_ac):
    """A3C-style trainer worker for the osim ``RunEnv`` environment.

    Each iteration: sync weights from ``shared_model``, collect at least
    ``args.batch_size`` environment steps of whole episodes, run a local
    actor-critic update, push the resulting gradients into
    ``shared_grad_buffers``, then spin until ``traffic_light`` signals that
    the parameter server has applied the update.

    Args:
        rank: worker index (seeds the RNG; rank 0 also prints progress).
        args: CLI namespace (seed, feature, batch_size, skip, bh,
            start_epoch, log_interval, use_sep_pol_val, ...).
        traffic_light: shared signal used to synchronise workers with the
            parameter-server update.
        counter: shared counter of completed training iterations.
        shared_model: globally shared ActorCritic parameters.
        shared_grad_buffers: shared accumulator this worker adds gradients to.
        shared_obs_stats: shared running mean/std used to normalise states.
        opt_ac: optimiser whose state is saved alongside checkpoints.
    """
    best_result = -1000  # best batch-average reward seen so far
    torch.manual_seed(args.seed + rank)
    # the osim models are run in double precision
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    # carried observation history consumed/updated by process_observation()
    last_state = [0] * 41
    last_v = [0] * 10
    #last_state = numpy.zeros(48)
    env = RunEnv(visualize=False)
    #running_state = ZFilter((num_inputs,), clip=5)
    #running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []
    PATH_TO_MODEL = '../models/' + str(args.bh)
    ac_net = ActorCritic(num_inputs, num_actions)
    #running_state = ZFilter((num_inputs,), clip=5)
    start_time = time.time()
    for i_episode in range(args.start_epoch + 1, 999999):
        #print(shared_obs_stats.n[0])
        #print('hei')
        #if rank == 0:
        #    print(running_state.rs._n)
        # remember the current traffic-light state; we busy-wait on it at the
        # bottom of the loop until the parameter server flips it
        signal_init = traffic_light.get()
        memory = Memory()
        # pull the latest shared weights into the local network
        ac_net.load_state_dict(shared_model.state_dict())
        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        #Tot_loss = 0
        #Tot_num =
        # collect whole episodes until at least batch_size steps are gathered
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            #state = numpy.array(state)
            last_state, last_v, state = process_observation(
                last_state, last_v, state)
            state = numpy.array(state)
            #state = running_state(state)
            # normalise the observation with the shared running statistics
            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()
            #print(state)
            #return
            #print(AA)
            #print(type(AA))
            #print(type(state))
            #print(AA.shape)
            #print(state.shape)
            reward_sum = 0
            #timer = time.time()
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                #print(action)
                action = action.data[0].numpy()
                # guard against NaN actions: dump diagnostics and abort
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print(ac_net.affine1.weight)
                    print(ac_net.affine1.weight.data)
                    print('ERROR')
                    #action = select_action_actor_critic(state,ac_net)
                    #action = action.data[0].numpy()
                    #state = state + numpy.random.rand(args.feature)*0.001
                    raise RuntimeError('action NaN problem')
                #print(action)
                #print("------------------------")
                #timer = time.time()
                reward = 0
                # frame-skip: repeat the action twice more, accumulating reward
                if args.skip:
                    #env.step(action)
                    _, A, _, _ = env.step(action)
                    reward += A
                    _, A, _, _ = env.step(action)
                    reward += A
                # NOTE(review): the 9-dim action is duplicated into an 18-dim
                # env action -- presumably one copy per leg; confirm against
                # RunEnv's action space.
                BB = numpy.append(action, action)
                next_state, A, done, _ = env.step(BB)
                reward += A
                #print(next_state)
                #last_state = process_observation(state)
                last_state, last_v, next_state = process_observation(
                    last_state, last_v, next_state)
                next_state = numpy.array(next_state)
                #print(next_state)
                #print(next_state.shape)
                #return
                reward_sum += reward
                #print('env:')
                #print(time.time()-timer)
                #last_state ,next_state = update_observation(last_state,next_state)
                #next_state = running_state(next_state)
                # normalise the next observation with the shared statistics
                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()
                #print(next_state[41:82])
                # mask = 0 marks a terminal transition for the return estimate
                mask = 1
                if done:
                    mask = 0
                memory.push(state, np.array([action]), mask, next_state, reward)
                #if args.render:
                #    env.render()
                if done:
                    break
                state = next_state
            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum
        # average episode reward over this collection batch
        reward_batch /= num_episodes
        batch = memory.sample()
        #print('env:')
        #print(time.time()-timer)
        #timer = time.time()
        # local update, then publish gradients for the parameter server
        update_params_actor_critic(batch, args, ac_net, opt_ac)
        shared_grad_buffers.add_gradient(ac_net)
        counter.increment()
        epoch = i_episode
        # rank 0 handles progress printing and checkpointing
        if (i_episode % args.log_interval == 0) and (rank == 0):
            print(
                'TrainEpisode {}\tTime{}\tLast reward: {}\tAverage reward {:.2f}'
                .format(
                    i_episode,
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, reward_batch))
            epoch = i_episode
            # checkpoint whenever the batch-average reward improves
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, 'best')
            # periodic checkpoint every 30 epochs
            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, epoch)
        # wait for a new signal to continue
        while traffic_light.get() == signal_init:
            pass
class ActorCriticAgentUsingICM:
    """Actor-critic agent augmented with an Intrinsic Curiosity Module (ICM).

    Transitions for one episode are buffered in ``self.memory`` via
    :meth:`cache`; :meth:`update_model` then performs a single Monte-Carlo
    policy-gradient update combined with the ICM forward/inverse losses and
    clears the buffer.
    """

    def __init__(self, nb_actions, learning_rate, gamma, hidden_size,
                 model_input_size, entropy_coeff_start, entropy_coeff_end,
                 entropy_coeff_anneal, continuous):
        # Run the actor-critic on GPU when available.
        # NOTE(review): the ICM module is not moved to self.device here --
        # verify ICM handles its own device placement.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.num_actions = nb_actions
        self.gamma = gamma  # discount factor for the return computation
        self.continuous = continuous  # continuous vs. discrete action space
        self.learning_rate = learning_rate
        # Entropy bonus is linearly annealed from start to end over
        # `entropy_coeff_anneal` action selections (see get_entropy_coefficient).
        self.entropy_coefficient_start = entropy_coeff_start
        self.entropy_coefficient_end = entropy_coeff_end
        self.entropy_coefficient_anneal = entropy_coeff_anneal
        self.step_no = 0  # counts select_action calls; drives annealing
        if self.continuous:
            self.model = ActorCriticContinuous(hidden_size=hidden_size,
                                               inputs=model_input_size,
                                               outputs=nb_actions).to(
                                                   self.device)
        else:
            self.model = ActorCritic(hidden_size=hidden_size,
                                     inputs=model_input_size,
                                     outputs=nb_actions).to(self.device)
        self.hidden_size = hidden_size
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.learning_rate)
        self.loss_function = torch.nn.MSELoss()  # critic (value) loss
        self.memory = []  # per-episode transition buffer, see cache()
        self.ICM = ICM(model_input_size, nb_actions)
        self.ICM.train()

    # Get the current entropy coefficient value according to the start/end
    # and annealing values (linear decay until `entropy_coefficient_anneal`
    # steps, constant `entropy_coefficient_end` afterwards).
    def get_entropy_coefficient(self):
        entropy = self.entropy_coefficient_end
        if self.step_no < self.entropy_coefficient_anneal:
            entropy = self.entropy_coefficient_start - self.step_no * \
                ((self.entropy_coefficient_start - self.entropy_coefficient_end)
                 / self.entropy_coefficient_anneal)
        return entropy

    # select an action with policy: returns the action distribution
    # (Normal for continuous, Categorical for discrete) and the critic's
    # state-value estimate.
    def select_action(self, state):
        self.step_no += 1  # advances the entropy-coefficient annealing
        if self.continuous:
            action_mean, action_dev, state_value = self.model(state)
            action_dist = Normal(action_mean, action_dev)
        else:
            action_probs, state_value = self.model(state)
            action_dist = Categorical(action_probs)
        return action_dist, state_value

    def update_model(self):
        """Run one combined actor-critic + ICM update from the episode
        buffered in ``self.memory`` and clear the buffer.

        Returns:
            float: scalar value of the total loss that was optimised.
        """
        Gt = torch.tensor(0)
        policy_losses = []
        forward_losses = []
        inverse_losses = []
        value_losses = []
        entropy_loss = []
        returns = []

        # calculate the true value using rewards returned from the environment
        # (iterate the episode backwards to accumulate discounted returns)
        for (_, reward, _, _, _, _, _) in self.memory[::-1]:
            # calculate the discounted value
            Gt = reward + self.gamma * Gt
            returns.insert(0, Gt)

        # standardise returns for a lower-variance policy gradient
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        for (action_prob, _, state_value, entropy, state, next_state,
             action), Gt in zip(self.memory, returns):
            advantage = Gt.item() - state_value.item()

            # calculate actor (policy) loss
            policy_losses.append((-action_prob * advantage).mean())

            # calculate critic (value) loss using model loss function
            value_losses.append(
                self.loss_function(state_value, Gt.unsqueeze(0)))
            entropy_loss.append(-entropy)

            # curiosity losses from the ICM's forward/inverse models
            forward_losses.append(
                self.ICM.get_forward_loss(state, action, next_state))
            inverse_losses.append(
                self.ICM.get_inverse_loss(state, action, next_state))

        # reset gradients
        self.optimizer.zero_grad()
        self.ICM.optimizer.zero_grad()

        # sum up all the values of policy_losses and value_losses; the ICM
        # terms are mixed by beta and the A2C terms scaled by lambda_weight
        icm_loss = (1 - self.ICM.beta) * torch.stack(inverse_losses).mean(
        ) + self.ICM.beta * torch.stack(forward_losses).mean()
        loss = self.ICM.lambda_weight*(torch.stack(policy_losses).mean() + \
            torch.stack(value_losses).mean() + self.get_entropy_coefficient() * \
            torch.stack(entropy_loss).mean()) + icm_loss
        loss.backward()

        # step both the actor-critic and the ICM optimisers
        self.optimizer.step()
        self.ICM.optimizer.step()
        self.memory = []
        return loss.item()

    # save model
    def save(self, path, name):
        """Save the actor-critic weights to <module dir>/<path>/<name>.pt."""
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, os.path.join(path, name + ".pt"))
        torch.save(self.model.state_dict(), filename)

    # load a model
    def load(self, path):
        """Load actor-critic weights from a path relative to this module."""
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, path)
        self.model.load_state_dict(torch.load(filename))

    def cache(self, action_prob, reward, state_value, entropy, state,
              next_state, action):
        """Append one transition to the buffer consumed by update_model()."""
        self.memory.append((action_prob, reward, state_value, entropy, state,
                            next_state, action))
    return args


if __name__=='__main__':
    args=get_args()
    # level id, e.g. "1-1", used for the model folder and checkpoint names
    LEVEL=str(args.world)+'-'+str(args.stage)
    folder='./model/{}'.format(LEVEL)
    if(not os.path.exists(folder)):
        os.mkdir(folder)
    # choose the global (shared) network architecture
    if(args.model_type == "LSTM"):
        global_model=ActorCritic_LSTM()
    else:
        global_model=ActorCritic()
    global_model.to(device)
    # optimiser with shared state so A3C worker processes can all step it
    optimizer=SharedAdam(global_model.parameters(),lr=1e-4)
    PATH='./model/{}/A3C_{}_{}.pkl'.format(LEVEL,LEVEL,args.model_type)
    epoch=1
    # optionally resume model/optimiser/epoch from an existing checkpoint
    if(args.load_model):
        if(os.path.exists(PATH)):
            print('Loaded Model')
            check_point=torch.load(PATH)
            global_model.load_state_dict(check_point['model_state_dict'])
            optimizer.load_state_dict(check_point['optimizer_state_dict'])
            epoch=check_point['epoch']
    # expose the parameters to all worker processes
    global_model.share_memory()
def main():
    """Train a PPO agent on a toy portfolio-trading environment.

    Builds a wrapped PortfolioTradingGym env, collects rollouts into a
    cherry ExperienceReplay, and once BATCH_SIZE transitions are gathered
    runs PPO_EPOCHS epochs of clipped-PPO policy updates plus MSE value
    regression, then empties the replay.
    """
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number,
                               feature_number=20,
                               start="2019-05-01",
                               end="2019-12-12",
                               frequency="D")
    env = PortfolioTradingGym(data_df=toy_data,
                              sequence_window=5,
                              add_cash=True)
    # wrapper stack: numpy obs -> episode logging -> torch tensors -> runner
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # create net
    action_size = env.action_space.shape[0]
    number_asset, seq_window, features_number = env.observation_space.shape
    input_size = features_number

    agent = ActorCritic(input_size=input_size,
                        hidden_size=HIDDEN_SIZE,
                        action_size=action_size)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)
        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
                # GAE advantages with a zero bootstrap value for the final state
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            # here is to add readability
            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value']
                    new_log_probs = masses.log_prob(
                        replay.action()).unsqueeze(-1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(
                    new_log_probs,
                    old_log_probs,
                    advantages,
                    clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(
                    new_values, returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()
import torch
from torch import optim
from tqdm import tqdm

from env import Env
from models import ActorCritic
from utils import plot


# Hyperparameters: total env steps, trajectories per update, discount gamma,
# GAE trace-decay lambda.
max_steps, batch_size, discount, trace_decay = 100000, 16, 0.99, 0.97

env = Env()
agent = ActorCritic()
# The policy's log-std is a free parameter optimised alongside the actor.
actor_optimiser = optim.Adam(list(agent.actor.parameters()) + [agent.policy_log_std], lr=3e-4)
critic_optimiser = optim.Adam(agent.critic.parameters(), lr=1e-3)

step, pbar = 0, tqdm(total=max_steps, smoothing=0)
while step < max_steps:
    # Collect set of trajectories D by running policy π in the environment
    # BUG FIX: `[[]] * batch_size` replicates ONE list object batch_size
    # times, so every `D[idx].append(...)` pushed into the same shared list
    # and all trajectories were merged together. Build independent lists.
    D = [[] for _ in range(batch_size)]
    for idx in range(batch_size):
        state, done, total_reward = env.reset(), False, 0
        while not done:
            policy, value = agent(state)
            action = policy.sample()
            log_prob_action = policy.log_prob(action)
            next_state, reward, done = env.step(action)
            step += 1
            pbar.update(1)
            total_reward += reward
            # store one transition of trajectory idx
            D[idx].append({'state': state, 'action': action, 'reward': torch.tensor([reward]), 'log_prob_action': log_prob_action, 'value': value})
            state = next_state
def play(args):
    """Play Super Mario with a trained ActorCritic checkpoint, rendering
    every frame and printing live progress.

    Args:
        args: CLI namespace with env_name, move_set, model_id, algorithm
            and reset_delay attributes.
    """
    env = create_mario_env(args.env_name, ACTIONS[args.move_set])
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    model = ActorCritic(observation_space, action_space)

    checkpoint_file = \
        f"{args.env_name}/{args.model_id}_{args.algorithm}_params.tar"
    checkpoint = restore_checkpoint(checkpoint_file)
    # FIX: the assertion message was missing its f-prefix, so the
    # {checkpoint['env']} placeholder was printed literally instead of the
    # offending environment name.
    assert args.env_name == checkpoint['env'], \
        f"This checkpoint is for different environment: {checkpoint['env']}"
    args.model_id = checkpoint['id']
    print(f"Environment: {args.env_name}")
    print(f" Agent: {args.model_id}")
    model.load_state_dict(checkpoint['model_state_dict'])

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    episode_length = 0
    start_time = time.time()

    for step in count():
        episode_length += 1
        # reset the LSTM state at episode boundaries, otherwise detach and
        # carry it over
        if done:
            cx = torch.zeros(1, 512)
            hx = torch.zeros(1, 512)
        else:
            cx = cx.data
            hx = hx.data

        with torch.no_grad():
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        # greedy policy: always take the most probable action
        action = prob.max(-1, keepdim=True)[1]
        action_idx = action.item()
        action_out = ACTIONS[args.move_set][action_idx]

        state, reward, done, info = env.step(action_idx)
        # FIX: convert the raw numpy observation back to a tensor; previously
        # only the reset paths converted, so the next forward pass would have
        # received a numpy array.
        state = torch.from_numpy(state)
        reward_sum += reward

        print(
            f"{emojize(':mushroom:')} World {info['world']}-{info['stage']} | {emojize(':video_game:')}: [ {' + '.join(action_out):^13s} ] | ",
            end='\r',
        )
        env.render()

        if done:
            t = time.time() - start_time
            print(
                f"{emojize(':mushroom:')} World {info['world']}-{info['stage']} |" + \
                f" {emojize(':video_game:')}: [ {' + '.join(action_out):^13s} ] | " + \
                f"ID: {args.model_id}, " + \
                f"Time: {time.strftime('%H:%M:%S', time.gmtime(t)):^9s}, " + \
                f"Reward: {reward_sum: 10.2f}, " + \
                f"Progress: {(info['x_pos'] / 3225) * 100: 3.2f}%",
                end='\r',
                flush=True,
            )
            reward_sum = 0
            episode_length = 0
            time.sleep(args.reset_delay)
            state = env.reset()
            state = torch.from_numpy(state)
def test(rank, args, shared_model, opt_ac):
    """Evaluation worker companion to train(): periodically rolls out the
    current shared policy on RunEnv, logs the batch-average reward and
    writes checkpoints.

    NOTE(review): unlike train(), observations here go through a local
    ZFilter rather than the shared observation statistics -- confirm this
    mismatch is intentional.

    Args:
        rank: worker index, only used for RNG seeding.
        args: CLI namespace (seed, feature, render, bh, batch_size, skip,
            log_interval, use_sep_pol_val, ...).
        shared_model: globally shared ActorCritic parameters to evaluate.
        opt_ac: optimiser whose state is stored in checkpoints.
    """
    best_result = -1000  # best batch-average reward seen so far
    torch.manual_seed(args.seed + rank)
    # the osim models are run in double precision
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = numpy.zeros(41)

    if args.render:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)

    # local observation/reward normalisers
    running_state = ZFilter((num_inputs, ), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    episode_lengths = []
    PATH_TO_MODEL = '../models/' + str(args.bh)
    ac_net = ActorCritic(num_inputs, num_actions)
    start_time = time.time()
    for i_episode in count(1):
        memory = Memory()
        # refresh the local copy of the shared parameters
        ac_net.load_state_dict(shared_model.state_dict())
        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        # collect whole episodes until at least batch_size steps are gathered
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            state = numpy.array(state)
            #global last_state
            #last_state = state
            #last_state,_ = update_observation(last_state,state)
            #last_state,state = update_observation(last_state,state)
            #print(state.shape[0])
            #print(state[41])
            state = running_state(state)
            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                #timer = time.time()
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                #print(action)
                action = action.data[0].numpy()
                # NaN guard: abort this evaluation worker entirely
                if numpy.any(numpy.isnan(action)):
                    print(action)
                    puts('ERROR')
                    return
                #print('NN take:')
                #print(time.time()-timer)
                #print(action)
                #print("------------------------")
                #timer = time.time()
                # frame-skip: one extra repeated step when enabled
                if args.skip:
                    #env.step(action)
                    _, reward, _, _ = env.step(action)
                    reward_sum += reward
                next_state, reward, done, _ = env.step(action)
                next_state = numpy.array(next_state)
                reward_sum += reward
                #print('env take:')
                #print(time.time()-timer)
                #timer = time.time()
                #last_state ,next_state = update_observation(last_state,next_state)
                next_state = running_state(next_state)
                #print(next_state[41:82])
                # mask = 0 marks a terminal transition
                mask = 1
                if done:
                    mask = 0
                #print('update take:')
                #print(time.time()-timer)
                #timer = time.time()
                memory.push(state, np.array([action]), mask, next_state,
                            reward)
                #print('memory take:')
                #print(time.time()-timer)
                #if args.render:
                #    env.render()
                if done:
                    break
                state = next_state
            num_steps += (t - 1)
            num_episodes += 1
            #print(num_episodes)
            reward_batch += reward_sum
        #print(num_episodes)
        # average episode reward over this evaluation batch
        reward_batch /= num_episodes
        batch = memory.sample()
        #update_params_actor_critic(batch,args,shared_model,ac_net,opt_ac)
        # evaluation only needs to run occasionally
        time.sleep(60)

        if i_episode % args.log_interval == 0:
            # append the result to the on-disk record file
            File = open(PATH_TO_MODEL + '/record.txt', 'a+')
            File.write("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            File.close()
            #print('TestEpisode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
            #    i_episode, reward_sum, reward_batch))
            print("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            #print('!!!!')
            epoch = i_episode
            # checkpoint whenever the batch-average reward improves
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                    }, PATH_TO_MODEL, 'best')
            # periodic checkpoint every 30 epochs
            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                    }, PATH_TO_MODEL, epoch)
# Resume from a fixed checkpoint, or build fresh networks and optimisers.
if args.resume:
    print("=> loading checkpoint ")
    # NOTE(review): checkpoint path is hard-coded; assumes ../models/ss/3.t7
    # exists relative to the working directory.
    checkpoint = torch.load('../models/ss/3.t7')
    #args.start_epoch = checkpoint['epoch']
    #best_prec1 = checkpoint['best_prec1']
    ac_net.load_state_dict(checkpoint['state_dict'])
    opt_ac.load_state_dict(checkpoint['optimizer'])
    print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))
else:
    if args.use_sep_pol_val:
        # separate policy and value networks, each with its own optimiser
        policy_net = Policy(num_inputs, num_actions)
        value_net = Value(num_inputs)
        opt_policy = optim.Adam(policy_net.parameters(), lr=args.lr)
        opt_value = optim.Adam(value_net.parameters(), lr=args.lr)
    else:
        # single shared actor-critic network
        ac_net = ActorCritic(num_inputs, num_actions)
        opt_ac = optim.Adam(ac_net.parameters(), lr=args.lr)


def select_action(state):
    # Sample a continuous action from the separate policy network.
    # NOTE(review): relies on the module-level `policy_net`; only valid when
    # args.use_sep_pol_val was set above.
    state = torch.from_numpy(state).unsqueeze(0)
    action_mean, _, action_std = policy_net(Variable(state))
    action = torch.normal(action_mean, action_std)
    return action


def select_action_actor_critic(state):
    # Sample a continuous action from the shared actor-critic network
    # (the value-head output `v` is ignored here).
    state = torch.from_numpy(state).unsqueeze(0)
    action_mean, _, action_std, v = ac_net(Variable(state))
    action = torch.normal(action_mean, action_std)
    return action