# Model-based training loop: the DDPG policy gathers rollouts with learn_env and a
# separate model is trained on them. Assumes env, param_noise, action_noise, seed,
# environment and the model object are defined elsewhere, with the same imports as below.
print('Model initialized')

policy = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
total_steps = 1000
print('DDPG initialized')

# Bounded sample buffers: once 50 000 entries are stored, the oldest are discarded
sample_buffer1 = deque(maxlen=50000)
sample_buffer2 = deque(maxlen=50000)
sample_buffer3 = deque(maxlen=50000)

env_timesteps = 1000  # initial rollout length (assumed value; must be set before the first learn_env call)
for iteration in range(total_steps):
    print('Iteration :', iteration + 1)
    print('Training policy on environment')
    # learn_env is a fork-specific extension of stable-baselines' DDPG that also returns the collected samples
    samples = policy.learn_env(total_timesteps=env_timesteps, seed=seed,
                               environment=environment, log_interval=10)
    env_timesteps = 1000  # change env_timesteps from the next iteration onwards

    # Append the new samples to the replay-style buffers
    for z in samples[0]:
        sample_buffer1.append(z)
    for z in samples[1]:
        sample_buffer2.append(z)
    for z in samples[2]:
        sample_buffer3.append(z)

    # Train the model on everything collected so far
    loss = model.train_network(np.array(sample_buffer1), np.array(sample_buffer2),
                               np.array(sample_buffer3), algorithm_id='DDPG',
                               mini_batch_num=1000)
    print('Model train loss = ', loss)
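The three sample buffers act as bounded FIFO stores: once maxlen entries are reached, the oldest samples are evicted as new ones arrive, so the model always trains on the most recent 50 000 samples per buffer. A minimal sketch of that deque behaviour (the buffer name and size here are illustrative only):

from collections import deque

buf = deque(maxlen=3)        # hypothetical small buffer
for x in range(5):
    buf.append(x)            # appending past maxlen silently drops the oldest entry
print(list(buf))             # prints [2, 3, 4]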
# Standalone script: train DDPG on the environment, save the agent, then reload and run it.
# Assumes `environment`, `total_timesteps`, `seed` and `pathmodel` are defined elsewhere;
# the imports below assume the stable-baselines (v2) API, and learn_env additionally
# requires the modified fork used above.
import gym
import numpy as np

from stable_baselines import DDPG
from stable_baselines.ddpg.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise

env = gym.make(environment)
env = DummyVecEnv([lambda: env])

# The noise objects for DDPG: no parameter noise, Ornstein-Uhlenbeck action noise
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=float(0.5) * np.ones(n_actions))

model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
n = model.learn_env(total_timesteps=total_timesteps, seed=seed, environment=environment)
model.save(pathmodel)
# np.save("swimmer_ddpg_reward.npy", n)

del model  # remove to demonstrate saving and loading
print("Done...")

# Reload the saved agent and run it in the environment
model = DDPG.load(pathmodel)
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
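For reference, unmodified stable-baselines (v2) exposes `learn` rather than the fork-specific `learn_env`. A minimal sketch of the same train / save / load cycle with the standard API, using Pendulum-v0 and a local save path purely as illustrative stand-ins:

import gym
import numpy as np

from stable_baselines import DDPG
from stable_baselines.ddpg.policies import MlpPolicy
from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise

env = gym.make("Pendulum-v0")                        # stand-in environment
n_actions = env.action_space.shape[-1]
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.5 * np.ones(n_actions))

model = DDPG(MlpPolicy, env, verbose=1, param_noise=None, action_noise=action_noise)
model.learn(total_timesteps=10000)                   # standard API call
model.save("ddpg_pendulum")                          # illustrative save path

loaded = DDPG.load("ddpg_pendulum")
obs = env.reset()
action, _states = loaded.predict(obs)                # query the reloaded agent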