def __init__(self, load_model=False, model_path=None):
    self.preprocessor = DTPytorchWrapper()
    self.model = DDPG(state_dim=self.preprocessor.shape, action_dim=2,
                      max_action=1, net_type="cnn")
    self.current_image = np.zeros((640, 480, 3))
def __init__(self, load_model=False, model_path=None):
    self.preprocessor = DTPytorchWrapper()
    self.model = DDPG(state_dim=self.preprocessor.shape, action_dim=2,
                      max_action=1, net_type="cnn")
    self.current_image = np.zeros((640, 480, 3))
    if load_model:
        fp = model_path if model_path else "model"
        self.model.load(fp, "models", for_inference=True)
def __init__(self, obs_space, action_space, ram, writer, device, args):
    """
    :param obs_space: observation space (gym.spaces.Box)
    :param action_space: action space (gym.spaces.Box)
    :param ram: replay memory buffer object
    """
    self.state_dim = obs_space.shape[0]
    self.action_dim = action_space.shape[0]
    self.action_high = action_space.high
    self.action_low = action_space.low
    self.ram = ram
    self.iter = 1
    self.steps = 0
    self.gamma = args.gamma
    self.batch_size = args.batch_size
    self.tau = args.tau
    self.decay_rate = args.decay_rate
    self.eps_start = args.eps_start
    self.eps_end = args.eps_end
    self.eps_decay = args.eps_decay
    self.start_step = args.start_learning
    self.device = device
    self.noise = utils.OrnsteinUhlenbeckActionNoise(self.action_dim)
    self.writer = writer
    self.args = args

    # Build the learning and target networks, then copy weights so they start identical.
    target_net = DDPG(obs_space.shape, self.action_dim, args).to(device)
    learn_net = DDPG(obs_space.shape, self.action_dim, args).to(device)
    utils.hard_update(target_net, learn_net)
    self.AC = learn_net
    self.AC_T = target_net
    self.actor_optimizer = torch.optim.Adam(
        self.AC.actor.policyNet.parameters(), args.lr_a)
    self.critic_optimizer = torch.optim.Adam(self.AC.critic.parameters(), args.lr_c)
    self.actor = self.AC.actor
    self.target_actor = self.AC_T.actor
    self.critic = self.AC.critic
    self.target_critic = self.AC_T.critic
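# The constructor above synchronizes the target network with utils.hard_update and keeps
# `tau` for later soft updates. A minimal sketch of what such helpers typically do, assuming
# both networks are torch.nn.Module instances (hypothetical stand-ins, not the repo's utils):
import torch


def hard_update(target: torch.nn.Module, source: torch.nn.Module) -> None:
    """Copy every parameter of `source` into `target`."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)


def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float) -> None:
    """Polyak averaging: target <- (1 - tau) * target + tau * source."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)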
def init(self, context: Context):
    context.info('init()')
    self.image_processor = DTPytorchWrapper()
    self.action_processor = ActionWrapper(FakeWrap())
    from model import DDPG
    self.check_gpu_available(context)
    self.model = DDPG(state_dim=self.image_processor.shape, action_dim=2,
                      max_action=1, net_type="cnn")
    self.current_image = np.zeros((640, 480, 3))
    self.model.load("model", directory="./models")
def __init__(self, load_model=False, model_path=None):
    logger.info('PytorchRLTemplateAgent init')
    self.preprocessor = DTPytorchWrapper()
    self.model = DDPG(state_dim=self.preprocessor.shape, action_dim=2,
                      max_action=1, net_type="cnn")
    self.current_image = np.zeros((640, 480, 3))
    if load_model:
        logger.info('PytorchRLTemplateAgent loading models')
        fp = model_path if model_path else "model"
        self.model.load(fp, "models", for_inference=True)
    logger.info('PytorchRLTemplateAgent init complete')
def init(self, context: Context):
    self.check_gpu_available(context)
    logger.info("PytorchRLTemplateAgent init")
    from model import DDPG
    self.preprocessor = DTPytorchWrapper()
    self.model = DDPG(state_dim=self.preprocessor.shape, action_dim=2,
                      max_action=1, net_type="cnn")
    self.current_image = np.zeros((640, 480, 3))
    if self.load_model:
        logger.info("Pytorch Template Agent loading models")
        fp = self.model_path if self.model_path else "model"
        self.model.load(fp, "models", for_inference=True)
    logger.info("PytorchRLTemplateAgent init complete")
def __init__(self, hparams):
    super(HER, self).__init__()
    self.hparams = hparams
    self.test_env = make_env(hparams, render=self.hparams.render_test)
    sample_obs = self.test_env.observation_space['observation'].sample()
    sample_goal = self.test_env.observation_space['achieved_goal'].sample()

    # Hard-coded values for Bullet-HRL.
    action_limits, state_limits = get_env_boundaries()
    action_offset, action_bounds, action_clip_low, action_clip_high = action_limits

    state_shape = sample_obs.shape[0]
    action_shape = self.test_env.action_space.shape[0]
    goal_shape = sample_goal.shape[0]
    self.action_clips = (action_clip_low, action_clip_high)

    self.model = DDPG(params=self.hparams, obs_size=state_shape, goal_size=goal_shape,
                      act_size=action_shape,
                      action_clips=(action_clip_low, action_clip_high),
                      action_bounds=action_bounds, action_offset=action_offset)
    self.model.actor.share_memory()
    self.model.critic.share_memory()

    self.state_normalizer = Normalizer(
        state_shape, default_clip_range=self.hparams.clip_range)
    self.goal_normalizer = Normalizer(
        goal_shape, default_clip_range=self.hparams.clip_range)

    self.replay_buffer = SharedReplayBuffer(self.hparams.buffer_size,
                                            state_shape, action_shape, goal_shape)
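# The Normalizer above tracks running statistics for states and goals and clips normalized
# values to hparams.clip_range. A simplified sketch of that behaviour (an illustrative,
# non-shared-memory version; the actual Normalizer API is assumed, not confirmed):
import numpy as np


class RunningNormalizer:
    def __init__(self, size: int, default_clip_range: float = 5.0, eps: float = 1e-2):
        self.sum = np.zeros(size, dtype=np.float64)
        self.sumsq = np.zeros(size, dtype=np.float64)
        self.count = 0
        self.clip_range = default_clip_range
        self.eps = eps

    def update(self, batch: np.ndarray) -> None:
        # Accumulate first and second moments from a batch of samples.
        batch = np.atleast_2d(batch)
        self.sum += batch.sum(axis=0)
        self.sumsq += np.square(batch).sum(axis=0)
        self.count += batch.shape[0]

    def normalize(self, x: np.ndarray) -> np.ndarray:
        # Standardize with the running mean/std, then clip to the configured range.
        mean = self.sum / max(self.count, 1)
        var = self.sumsq / max(self.count, 1) - np.square(mean)
        std = np.sqrt(np.maximum(var, np.square(self.eps)))
        return np.clip((x - mean) / std, -self.clip_range, self.clip_range)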
def solve(params, cis):
    # Python has dynamic typing; the assert below helps IDEs with autocompletion.
    assert isinstance(cis, ChallengeInterfaceSolution)
    # After this, `cis.` will offer autocompletion in some IDEs (e.g. PyCharm).
    cis.info('Creating model.')
    # You have logging capabilities through the solution interface (cis);
    # the info you log can be retrieved from your submission files.

    # We get the environment from the Evaluation Engine.
    cis.info('Making environment')
    env = gym.make(params['env'])

    # === BEGIN SUBMISSION ===
    # If you created custom wrappers, you also need to copy them into this folder.
    from wrappers import NormalizeWrapper, ImgWrapper, ActionWrapper, ResizeWrapper
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    # Make the images pytorch-conv-compatible.
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    # You ONLY need this wrapper if you trained your policy on [speed, steering angle]
    # instead of [left speed, right speed].
    env = SteeringToWheelVelWrapper(env)
    # Make sure you wrap at least the actions and observations the same way as during
    # training, so that your model receives the same kind of input it was trained on
    # (for example, if your model was trained on grayscale images and you don't convert
    # to grayscale here, your model won't work).

    # HERE YOU NEED TO CREATE THE POLICY NETWORK, THE SAME AS IN YOUR TRAINING CODE.
    # If you aren't using the DDPG baseline code, make sure to copy your model into
    # the model.py file and that it provides a model.predict(state) method.
    from model import DDPG

    model = DDPG(state_dim=env.observation_space.shape, action_dim=2,
                 max_action=1, net_type="cnn")

    try:
        model.load("model", "models")
        # === END SUBMISSION ===

        # Make sure we have a connection with the environment and it is ready to go.
        cis.info('Reset environment')
        observation = env.reset()

        # While there is no completion signal (simulation done), we run predictions for a
        # number of episodes; the Evaluation Engine controls this part.
        while True:
            # We pass the observation to our model and get an action in return.
            action = model.predict(observation)
            # We tell the environment to perform this action and get some info back,
            # OpenAI Gym style.
            observation, reward, done, info = env.step(action)
            # Here you may want to compute some stats, such as how much reward you are
            # getting; note that this reward may not be the challenge score.

            # It is important to check for this flag: the Evaluation Engine will let us
            # know when we should finish. If we are not careful with this, the Evaluation
            # Engine will kill our container and we will get no score from this submission.
            if 'simulation_done' in info:
                cis.info('simulation_done received.')
                break
            if done:
                cis.info('Episode done; calling reset()')
                env.reset()
    finally:
        # Release CPU/GPU resources; let's be friendly with other users that may need them.
        cis.info('Releasing resources')
        try:
            model.close()
        except Exception:
            msg = 'Could not call model.close():\n%s' % traceback.format_exc()
            cis.error(msg)
    cis.info('Graceful exit of solve()')
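# The loop in solve() only relies on three methods from model.py: load(), predict(), and
# close(). A minimal stand-in that satisfies this interface (a random policy used purely to
# illustrate the expected surface; it is not the DDPG baseline):
import numpy as np


class RandomPolicyModel:
    def __init__(self, action_dim: int = 2, max_action: float = 1.0):
        self.action_dim = action_dim
        self.max_action = max_action

    def load(self, filename, directory):
        # A real model restores its weights here; a random policy has nothing to load.
        pass

    def predict(self, observation):
        # A real model would run its actor network on the observation.
        return np.random.uniform(-self.max_action, self.max_action, size=self.action_dim)

    def close(self):
        # Release any GPU or session resources held by the model.
        pass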
gamma = 0.99   # discount factor: how much of the critic value feeds into the target Q value
var = 3.0      # scale of the exploration noise added to actions

if __name__ == '__main__':
    # Create environment
    env = gym.make('Pendulum-v0').unwrapped
    n_state = env.observation_space.shape[0]   # dimensionality of the state
    n_action = env.action_space.shape[0]       # dimensionality of the action
    a_limit = env.action_space.high[0]         # largest possible value of the continuous action

    # Create network
    net = DDPG(n_state=n_state, n_action=n_action, a_limit=a_limit,
               model_folder=model_folder, memory_size=memory_size,
               batch_size=batch_size, tau=tau, gamma=gamma, var=var)
    net.load()

    # Train
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        for j in range(max_iter):
            # env.render()
            a = net.chooseAction(s)
            s_, r, finish, info = env.step(a)
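# `var` above controls the exploration noise applied to the deterministic policy output. A
# sketch of the usual scheme, as a hypothetical helper: perturb the action with Gaussian
# noise of scale `var` and clip the result to the action limit.
import numpy as np


def explore(action, var, a_limit):
    """Add Gaussian exploration noise and keep the action within [-a_limit, a_limit]."""
    noisy = np.random.normal(loc=action, scale=var)
    return np.clip(noisy, -a_limit, a_limit)


# If chooseAction returns the raw policy output, exploration is typically applied as
# a = explore(net.chooseAction(s), var, a_limit), with `var` decayed slowly
# (e.g. var *= 0.9995) once learning has started.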
def master_loop(env):
    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fileHandler = logging.FileHandler('./log/test.log')
    fileHandler.setFormatter(formatter)
    logger.addHandler(fileHandler)
    logger.setLevel(logging.INFO)

    s_dim = env.get_s_dim()
    a_dim = env.get_a_dim()
    a_high = env.get_a_high()
    a_low = env.get_a_low()
    print("s_dim: {}, a_dim: {}, a_high: {}, a_low: {}".format(
        s_dim, a_dim, a_high, a_low))

    ddpg = DDPG(a_dim, s_dim, a_high, a_low,
                lr_a=LR_A, lr_c=LR_C, gamma=GAMMA, tau=TAU,
                rpm_size=MEMORY_CAPACITY, batch_size=BATCH_SIZE)

    status = MPI.Status()
    start_time = time.time()
    reset_time = time.time()
    total_eps = 0
    total_step = 0
    n_step = 0
    n_eps = 0
    max_reward = -9999
    max_reward_rank = 0
    ddpg.load()

    while total_eps < MAX_EPISODES:
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
        source = status.Get_source()
        tag = status.Get_tag()
        if tag == REQ_ACTION:
            # Reply to a worker's request for an action in the given state.
            # action = env.action_space.sample()
            action = ddpg.choose_action(data)
            comm.send((action, total_eps, total_step), dest=source, tag=RSP_ACTION)
        elif tag == OBS_DATA:
            # A worker sent back a transition: store it and (periodically) learn.
            n_step += 1
            total_step += 1
            (s, a, r, s_, done, ep_reward, ep_step) = data
            is_done = 1.0 if done else 0.0
            ddpg.store_transition(s, a, r, s_, is_done)
            if ddpg.pointer > LEARN_START and total_step % 3 == 0:
                ddpg.learn()
            if done:
                total_eps += 1
                n_eps += 1  # episodes since the last stats report
                if ep_reward > max_reward:
                    max_reward = ep_reward
                    max_reward_rank = source
                s = ("eps: {:>8}, worker: {:>3}, ep_reward:{:7.4f}, "
                     "max:{:7.4f}/{:>3}, step:{:4}").format(
                        total_eps, source, ep_reward, max_reward, max_reward_rank, ep_step)
                logging.info(s)
                if total_eps % 500 == 0:
                    ddpg.save(total_eps)
                    interval = time.time() - reset_time
                    s = ("# total_step: {:>8}, total_eps: {:>6}, "
                         "eps/min: {:>6}, frame/sec: {:>6}").format(
                            total_step, total_eps,
                            n_eps / interval * 60, n_step / interval)
                    logging.info(s)
                    n_step = 0
                    n_eps = 0
                    reset_time = time.time()
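# The master above answers REQ_ACTION requests with actions and consumes OBS_DATA
# transitions. A sketch of the matching worker-side loop under the same protocol, assuming
# mpi4py, rank 0 as the master, and tag constants that agree with the master's:
import gym
from mpi4py import MPI

REQ_ACTION, RSP_ACTION, OBS_DATA = 1, 2, 3  # illustrative values; must match the master


def worker_loop(env_name="Pendulum-v0"):
    comm = MPI.COMM_WORLD
    env = gym.make(env_name)
    s = env.reset()
    ep_reward, ep_step = 0.0, 0
    while True:
        # Ask the master for an action for the current state.
        comm.send(s, dest=0, tag=REQ_ACTION)
        action, total_eps, total_step = comm.recv(source=0, tag=RSP_ACTION)
        s_, r, done, _ = env.step(action)
        ep_reward += r
        ep_step += 1
        # Ship the transition back so the master can store it and learn.
        comm.send((s, action, r, s_, done, ep_reward, ep_step), dest=0, tag=OBS_DATA)
        s = s_
        if done:
            s = env.reset()
            ep_reward, ep_step = 0.0, 0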
if not os.path.exists(param_path):
    print("Creating parameter folder")
    os.makedirs(param_path)
if not os.path.exists(log_path):
    print("Creating log folder")
    os.makedirs(log_path)

env = make_env(env_name)
obs_ls = env.reset()  # initial state: one observation per agent

# The global (centralized) input size is the sum of every agent's observation length
# and discrete action count.
global_input_size = 0
for cv in obs_ls:
    global_input_size += len(cv)
for action_space in env.action_space:
    global_input_size += action_space.n

# Initialize one learning model and one target model per agent.
agent_models = [DDPG(str(i), len(obs_ls[i]), env.action_space[i].n,
                     global_input_size, MEM_LEN, LEARNING_RATE)
                for i in range(len(env.world.agents))]
target_models = [DDPG(str(i), len(obs_ls[i]), env.action_space[i].n,
                      global_input_size, MEM_LEN, LEARNING_RATE)
                 for i in range(len(env.world.agents))]
for idx, model in enumerate(target_models):
    model.load_state_dict(agent_models[idx].state_dict())

if LOAD_KEY:
    for idx, model in enumerate(agent_models):
        if idx == 0:
            check_point = torch.load('./param/DDPGagent0_listener_5000.pkl')
        else:
            check_point = torch.load('./param/DDPGagent1_listener_5m000.pkl')
        model.load_state_dict(check_point)

for epo_i in range(MAX_EPOCH):
    obs_ls = env.reset()
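# global_input_size above is the width of a centralized critic input: every agent's
# observation plus every agent's discrete action. A sketch of assembling that input by
# concatenating all observations with one-hot encoded actions (hypothetical helper):
import numpy as np


def build_global_input(obs_ls, action_ids, action_dims):
    """Concatenate all agents' observations and one-hot actions into one flat vector."""
    parts = [np.asarray(obs, dtype=np.float32).ravel() for obs in obs_ls]
    for act, n in zip(action_ids, action_dims):
        one_hot = np.zeros(n, dtype=np.float32)
        one_hot[act] = 1.0
        parts.append(one_hot)
    return np.concatenate(parts)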
max_iter = 200
model_folder = './model'
var = 0.0   # scale of the exploration noise added to actions

if __name__ == '__main__':
    # Create environment
    env = gym.make('Pendulum-v0').unwrapped
    n_state = env.observation_space.shape[0]   # dimensionality of the state
    n_action = env.action_space.shape[0]       # dimensionality of the action
    a_limit = env.action_space.high[0]         # largest possible value of the continuous action

    # Create network
    net = DDPG(
        n_state=n_state,
        n_action=n_action,
        a_limit=a_limit,
        model_folder=model_folder,
        var=var
    )
    net.load()

    # Train
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        for j in range(max_iter):
            env.render()
            a = net.chooseAction(s)
            s_, r, finish, info = env.step(a)
test_set.append(
    (user, list(test_user.loc[i:i + 9, 'itemId']),
     test_user.loc[i + 10, 'itemId'],
     test_user.loc[i + 9, 'timestamp'] - test_user.loc[i + 8, 'timestamp'],
     float(test_user.loc[i + 10, 'reward']),
     float(test_user.loc[i + 10, 'objective1']),
     float(test_user.loc[i + 10, 'objective2'])))

# Truncate both sets to a whole number of batches.
train_set = train_set[:len(train_set) // batch_size * batch_size]
test_set = test_set[:len(test_set) // batch_size * batch_size]

start_time = time.time()
gpu_options = tf.GPUOptions(allow_growth=True)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    primary_network = DDPG(hidden_size, 'primary_network')
    target_network = DDPG(hidden_size, 'target_network')
    model = Reinforce_Model(user_count, item_count, hidden_size, batch_size,
                            primary_network, target_network)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    print('Objective1_Value: %.4f\t Objective2_Value: %.4f\t'
          % evaluate(sess, model, train_set))
    sys.stdout.flush()
    lr = 1
    start_time = time.time()
    last_auc = 0.0

    for epoch in range(100):
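# primary_network and target_network above are two copies of the same DDPG graph. Assuming
# the second constructor argument is the TF1 variable scope name, a sketch of the usual
# soft-update op that nudges target variables toward the primary ones:
import tensorflow as tf


def build_soft_update_op(primary_scope, target_scope, tau=0.001):
    primary_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=primary_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope)
    updates = [t.assign((1.0 - tau) * t + tau * p)
               for p, t in zip(primary_vars, target_vars)]
    return tf.group(*updates)


# Typical usage: build the op once after both networks exist, e.g.
# soft_update = build_soft_update_op('primary_network', 'target_network'),
# then sess.run(soft_update) after each training step.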