예제 #1
0
 def __init__(self, load_model=False, model_path=None):
     """Create the observation preprocessor and the CNN-based DDPG model.

     Args:
         load_model: Accepted but not read anywhere in this method.
         model_path: Accepted but not read anywhere in this method.

     NOTE(review): neither argument has any effect here, so the model
     keeps whatever weights ``DDPG`` starts with -- confirm whether
     checkpoint loading was meant to happen in this constructor.
     """
     wrapper = DTPytorchWrapper()
     self.preprocessor = wrapper
     # The wrapper's reported shape is used as the network's state dimension.
     self.model = DDPG(state_dim=wrapper.shape, action_dim=2, max_action=1, net_type="cnn")
     # Zero-filled placeholder held as the current frame.
     self.current_image = np.zeros((640, 480, 3))
예제 #2
0
    def __init__(self, load_model=False, model_path=None):
        """Create the preprocessor and DDPG model, optionally restoring weights.

        Args:
            load_model: When truthy, ``self.model.load`` is called after
                the model has been constructed.
            model_path: First argument passed to ``self.model.load``; any
                falsy value is replaced by ``"model"``.
        """
        wrapper = DTPytorchWrapper()
        self.preprocessor = wrapper
        # The wrapper's reported shape is used as the network's state dimension.
        self.model = DDPG(
            state_dim=wrapper.shape,
            action_dim=2,
            max_action=1,
            net_type="cnn",
        )
        # Zero-filled placeholder held as the current frame.
        self.current_image = np.zeros((640, 480, 3))

        if not load_model:
            return

        checkpoint = model_path or "model"
        self.model.load(checkpoint, "models", for_inference=True)
예제 #3
0
    def __init__(self, obs_space, action_space, ram, writer, device, args):
        """Store hyperparameters and build the online/target networks.

        :param obs_space: observation space; only its ``shape`` is read
        :param action_space: action space; ``shape``, ``high`` and ``low``
            are read
        :param ram: replay memory buffer object (stored, not used here)
        :param writer: stored as ``self.writer`` (not used here)
        :param device: device both DDPG networks are moved to via ``.to``
        :param args: hyperparameter container; reads ``gamma``,
            ``batch_size``, ``tau``, ``decay_rate``, ``eps_start``,
            ``eps_end``, ``eps_decay``, ``start_learning``, ``lr_a`` and
            ``lr_c``, and is also passed to the ``DDPG`` constructor
        :return:
        """
        obs_shape = obs_space.shape

        # Dimensions and bounds taken from the two spaces.
        self.state_dim = obs_shape[0]
        self.action_dim = action_space.shape[0]
        self.action_high, self.action_low = action_space.high, action_space.low

        # Replay memory and counters.
        self.ram = ram
        self.iter = 1
        self.steps = 0

        # Hyperparameters copied off ``args``.
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.decay_rate = args.decay_rate
        self.eps_start, self.eps_end = args.eps_start, args.eps_end
        self.eps_decay = args.eps_decay
        self.start_step = args.start_learning

        self.device = device
        self.noise = utils.OrnsteinUhlenbeckActionNoise(self.action_dim)
        self.writer = writer
        self.args = args

        # Networks: the target is constructed first, then the learner, as
        # in the original, so construction order stays unchanged.
        frozen_ac = DDPG(obs_shape, self.action_dim, args).to(device)
        online_ac = DDPG(obs_shape, self.action_dim, args).to(device)
        # Presumably copies the learner's weights into the target -- see
        # utils.hard_update for the exact semantics.
        utils.hard_update(frozen_ac, online_ac)
        self.AC = online_ac
        self.AC_T = frozen_ac

        # One optimizer per head; the actor optimizer only covers policyNet.
        actor_params = self.AC.actor.policyNet.parameters()
        self.actor_optimizer = torch.optim.Adam(actor_params, args.lr_a)
        critic_params = self.AC.critic.parameters()
        self.critic_optimizer = torch.optim.Adam(critic_params, args.lr_c)

        # Convenience aliases onto the sub-networks.
        self.actor = self.AC.actor
        self.target_actor = self.AC_T.actor
        self.critic = self.AC.critic
        self.target_critic = self.AC_T.critic
    def init(self, context: Context):
        """Build the wrappers and the DDPG model, then load saved weights.

        Args:
            context: Object used for logging via ``info``; also handed to
                ``self.check_gpu_available``.
        """
        context.info('init()')

        image_wrapper = DTPytorchWrapper()
        self.image_processor = image_wrapper
        self.action_processor = ActionWrapper(FakeWrap())
        # Imported at call time, as in the original, rather than at module import.
        from model import DDPG

        self.check_gpu_available(context)

        # The image wrapper's reported shape is used as the state dimension.
        self.model = DDPG(
            state_dim=image_wrapper.shape,
            action_dim=2,
            max_action=1,
            net_type="cnn",
        )
        # Zero-filled placeholder held as the current frame.
        self.current_image = np.zeros((640, 480, 3))
        self.model.load("model", directory="./models")
예제 #5
0
    def __init__(self, load_model=False, model_path=None):
        """Create the preprocessor and DDPG model, logging each stage.

        Args:
            load_model: When truthy, ``self.model.load`` is called after
                the model has been constructed.
            model_path: First argument passed to ``self.model.load``; any
                falsy value is replaced by ``"model"``.
        """
        logger.info('PytorchRLTemplateAgent init')
        wrapper = DTPytorchWrapper()
        self.preprocessor = wrapper

        # The wrapper's reported shape is used as the network's state dimension.
        self.model = DDPG(state_dim=wrapper.shape, action_dim=2, max_action=1, net_type="cnn")
        # Zero-filled placeholder held as the current frame.
        self.current_image = np.zeros((640, 480, 3))

        if load_model:
            logger.info('PytorchRLTemplateAgent loading models')
            checkpoint = model_path or "model"
            self.model.load(checkpoint, "models", for_inference=True)
        logger.info('PytorchRLTemplateAgent init complete')
    def init(self, context: Context):
        """Build the preprocessor and DDPG model, optionally restoring weights.

        Reads ``self.load_model`` and ``self.model_path``, which must have
        been set before this method runs.

        Args:
            context: Handed to ``self.check_gpu_available``.
        """
        self.check_gpu_available(context)
        logger.info("PytorchRLTemplateAgent init")
        # Imported at call time, as in the original, rather than at module import.
        from model import DDPG

        wrapper = DTPytorchWrapper()
        self.preprocessor = wrapper

        # The wrapper's reported shape is used as the network's state dimension.
        self.model = DDPG(state_dim=wrapper.shape, action_dim=2, max_action=1, net_type="cnn")
        # Zero-filled placeholder held as the current frame.
        self.current_image = np.zeros((640, 480, 3))

        if self.load_model:
            logger.info("Pytorch Template Agent loading models")
            # A falsy model_path falls back to the default name "model".
            checkpoint = self.model_path or "model"
            self.model.load(checkpoint, "models", for_inference=True)
        logger.info("PytorchRLTemplateAgent init complete")
예제 #7
0
    def __init__(self, hparams):
        """Create the test env, DDPG model, normalizers and replay buffer.

        Args:
            hparams: Hyperparameter container; this method reads
                ``render_test``, ``clip_range`` and ``buffer_size`` and
                passes the whole object to ``make_env`` and ``DDPG``.
        """
        super(HER, self).__init__()

        self.hparams = hparams

        self.test_env = make_env(hparams, render=self.hparams.render_test)
        # Draw one sample from each sub-space just to read off its size.
        obs_space = self.test_env.observation_space
        sample_obs = obs_space['observation'].sample()
        sample_goal = obs_space['achieved_goal'].sample()

        # HARD CODED VALUES FOR Bullet-HRL
        # Only the action limits are used; the state limits are discarded.
        action_limits, _ = get_env_boundaries()
        action_offset, action_bounds, clip_low, clip_high = action_limits

        obs_size = sample_obs.shape[0]
        act_size = self.test_env.action_space.shape[0]
        goal_size = sample_goal.shape[0]
        self.action_clips = (clip_low, clip_high)

        self.model = DDPG(
            params=self.hparams,
            obs_size=obs_size,
            goal_size=goal_size,
            act_size=act_size,
            action_clips=(clip_low, clip_high),
            action_bounds=action_bounds,
            action_offset=action_offset,
        )

        # Call share_memory() on both networks, actor first.
        for network in (self.model.actor, self.model.critic):
            network.share_memory()

        # Separate normalizers for states and goals, same clip range.
        self.state_normalizer = Normalizer(
            obs_size, default_clip_range=self.hparams.clip_range)
        self.goal_normalizer = Normalizer(
            goal_size, default_clip_range=self.hparams.clip_range)

        self.replay_buffer = SharedReplayBuffer(
            self.hparams.buffer_size, obs_size, act_size, goal_size)
예제 #8
0
def solve(params, cis):
    """Run a saved DDPG policy against the evaluation environment.

    Creates the gym environment named by ``params['env']``, applies the
    observation/action wrappers, loads the saved weights and then steps
    the environment with the model's predictions until the step ``info``
    contains ``'simulation_done'``. ``model.close()`` is always attempted
    on the way out.

    Args:
        params: Mapping that must contain the key ``'env'`` (a gym
            environment id).
        cis: ``ChallengeInterfaceSolution`` used for logging through
            ``info`` and ``error``.
    """
    # python has dynamic typing, the line below can help IDEs with autocompletion
    assert isinstance(cis, ChallengeInterfaceSolution)
    # after this, `cis.` will provide autocompletion in some IDEs (e.g. PyCharm)
    cis.info('Creating model.')
    # you can log through the solution interface (cis);
    # what you log can be retrieved from your submission files.

    # We get the environment from the Evaluation Engine
    cis.info('Making environment')
    env = gym.make(params['env'])

    # === BEGIN SUBMISSION ===

    # If you created custom wrappers, you also need to copy them into this folder.

    from wrappers import NormalizeWrapper, ImgWrapper, ActionWrapper, ResizeWrapper

    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    # to make the images pytorch-conv-compatible
    env = ImgWrapper(env)
    env = ActionWrapper(env)

    # you ONLY need this wrapper if you trained your policy on [speed, steering angle]
    # instead of [left speed, right speed]
    env = SteeringToWheelVelWrapper(env)

    # Make sure actions and observations are wrapped the same way as during
    # training, so the model receives the kind of input it was trained on
    # (e.g. a model trained on grayscale images will not work if the images
    # are not converted to grayscale here as well).

    # HERE YOU NEED TO CREATE THE POLICY NETWORK SAME AS YOU DID IN THE TRAINING CODE
    # if you aren't using the DDPG baseline code, then make sure to copy your model
    # into the model.py file and that it has a model.predict(state) method.
    from model import DDPG

    model = DDPG(state_dim=env.observation_space.shape,
                 action_dim=2,
                 max_action=1,
                 net_type="cnn")

    try:
        model.load("model", "models")

        # === END SUBMISSION ===

        # Make sure we have a connection with the environment and it is ready to go
        cis.info('Reset environment')
        observation = env.reset()

        # Run predictions until the simulation signals completion; the
        # Evaluation Engine controls how many episodes that takes.
        while True:
            # pass the observation to the model and get an action in return
            action = model.predict(observation)
            # perform the action and get feedback back in OpenAI Gym style
            observation, reward, done, info = env.step(action)
            # here you may want to compute some stats, e.g. accumulated reward;
            # note that this reward may not be associated with the challenge score.

            # It is important to check this flag: the Evaluation Engine uses it
            # to tell us when to finish. If we ignore it, the engine will kill
            # the container and the submission will get no score.
            if 'simulation_done' in info:
                cis.info('simulation_done received.')
                break
            if done:
                cis.info('Episode done; calling reset()')
                # Keep the observation returned by reset(): the next
                # prediction must be made on the first frame of the new
                # episode, not on the last frame of the finished one.
                observation = env.reset()

    finally:
        # release CPU/GPU resources, let's be friendly with other users that may need them
        cis.info('Releasing resources')
        try:
            model.close()
        except Exception:
            # Best-effort cleanup: report the failure but do not mask the
            # original outcome. KeyboardInterrupt/SystemExit still propagate.
            msg = 'Could not call model.close():\n%s' % traceback.format_exc()
            cis.error(msg)
    cis.info('Graceful exit of solve()')
gamma = 0.99  # fraction of the critic value that is used as the target Q value
var = 3.0  # action exploration variance

if __name__ == '__main__':
    # Create environment
    env = gym.make('Pendulum-v0').unwrapped
    n_state = env.observation_space.shape[0]  # size of the state vector
    n_action = env.action_space.shape[0]  # size of the action vector
    a_limit = env.action_space.high[0]  # largest value the continuous action can take

    # Create network
    net = DDPG(n_state=n_state,
               n_action=n_action,
               a_limit=a_limit,
               model_folder=model_folder,
               memory_size=memory_size,
               batch_size=batch_size,
               tau=tau,
               gamma=gamma,
               var=var)
    net.load()

    # Train
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        for j in range(max_iter):
            # env.render()
            a = net.chooseAction(s)
            s_, r, finish, info = env.step(a)
예제 #10
0
def master_loop(env):
    """Serve actions to MPI workers and train one DDPG agent on their data.

    The loop blocks on ``comm.recv`` and dispatches on the message tag:

    * ``REQ_ACTION`` -- the payload is passed to ``ddpg.choose_action``
      and the action is sent back with the current episode/step totals.
    * ``OBS_DATA`` -- the payload is a transition tuple; it is stored,
      a learning step may run, and per-episode statistics are logged when
      the transition ends an episode.

    It returns once ``MAX_EPISODES`` finished episodes have been received.

    Args:
        env: Object providing ``get_s_dim()``, ``get_a_dim()``,
            ``get_a_high()`` and ``get_a_low()``; used only to size the
            agent.
    """
    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    file_handler = logging.FileHandler('./log/test.log')
    file_handler.setFormatter(formatter)

    logger.addHandler(file_handler)
    logger.setLevel(logging.INFO)

    s_dim = env.get_s_dim()
    a_dim = env.get_a_dim()
    a_high = env.get_a_high()
    a_low = env.get_a_low()
    print("s_dim: {}, a_dim{}, a_high:{}, a_low:{}".format(
        s_dim, a_dim, a_high, a_low))
    ddpg = DDPG(a_dim,
                s_dim,
                a_high,
                a_low,
                lr_a=LR_A,
                lr_c=LR_C,
                gamma=GAMMA,
                tau=TAU,
                rpm_size=MEMORY_CAPACITY,
                batch_size=BATCH_SIZE)

    status = MPI.Status()
    reset_time = time.time()

    # Totals over the whole run.
    total_eps = 0
    total_step = 0

    # Counters for the current reporting window (reset every 500 episodes).
    n_step = 0
    n_eps = 0

    max_reward = -9999
    max_reward_rank = 0

    ddpg.load()

    while total_eps < MAX_EPISODES:
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
        source = status.Get_source()
        tag = status.Get_tag()

        if tag == REQ_ACTION:
            action = ddpg.choose_action(data)
            comm.send((action, total_eps, total_step),
                      dest=source,
                      tag=RSP_ACTION)

        elif tag == OBS_DATA:
            n_step += 1
            total_step += 1
            (s, a, r, s_, done, ep_reward, ep_step) = data
            # The agent stores the terminal flag as a float.
            is_done = 1.0 if done else 0.0

            ddpg.store_transition(s, a, r, s_, is_done)

            # Learn on every third received step once enough data is stored.
            if ddpg.pointer > LEARN_START and total_step % 3 == 0:
                ddpg.learn()

            if done:
                total_eps += 1
                # Count the episode for the throughput window as well; this
                # counter used to be reset but never incremented, so the
                # reported eps/min was always zero.
                n_eps += 1
                if ep_reward > max_reward:
                    max_reward = ep_reward
                    max_reward_rank = source

                episode_msg = "eps: {:>8}, worker: {:>3}, ep_reward:{:7.4f}, max:{:7.4f}/{:>3}, step:{:4}".format(
                    total_eps, source, ep_reward, max_reward, max_reward_rank,
                    ep_step)
                logger.info(episode_msg)

                if total_eps % 500 == 0:
                    ddpg.save(total_eps)
                    interval = time.time() - reset_time
                    window_msg = "# total_step: {:>8} ,total_eps: {:>6} eps/min: {:>6}, frame/sec: {:>6}".format(
                        total_step, total_eps, n_eps / interval * 60,
                        n_step / interval)
                    logger.info(window_msg)

                    # Start a new reporting window.
                    n_step = 0
                    n_eps = 0
                    reset_time = time.time()
    # Make sure the parameter and log directories exist before training.
    if not os.path.exists(param_path):
        print("创建参数文件夹")
        os.makedirs(param_path)
    if not os.path.exists(log_path):
        print("创建日志文件夹")
        os.makedirs(log_path)
    env = make_env(env_name)
    obs_ls = env.reset()        # initial observations (indexed per agent below)

    # global_input_size = total length of all observations plus the sum of
    # every action space's `n`.
    global_input_size = 0
    for cv in obs_ls:
        global_input_size += len(cv)
    for action_space in env.action_space:
        global_input_size += action_space.n
    # Initialise the models: one learner and one target network per agent.
    agent_models = [DDPG(str(i), len(obs_ls[i]),  env.action_space[i].n, global_input_size,  MEM_LEN, LEARNING_RATE) for i in range(len(env.world.agents))]
    target_models = [DDPG(str(i), len(obs_ls[i]),  env.action_space[i].n, global_input_size,  MEM_LEN, LEARNING_RATE) for i in range(len(env.world.agents))]
    # Start every target network from its learner's weights.
    for idx, model in enumerate(target_models):
        model.load_state_dict(agent_models[idx].state_dict())


    # Optionally restore saved weights: agent 0 from one file, every other
    # agent from the second file.
    if LOAD_KEY:
        for idx, model in enumerate(agent_models):
            if idx == 0:
                check_point = torch.load('./param/DDPGagent0_listener_5000.pkl')
            else:
                # NOTE(review): "5m000" looks like a typo for "5000" -- confirm
                # against the files actually present in ./param.
                check_point = torch.load('./param/DDPGagent1_listener_5m000.pkl')
            model.load_state_dict(check_point)

    for epo_i in range(MAX_EPOCH):
        obs_ls = env.reset()
예제 #12
0
max_iter = 200
model_folder = './model'
var = 0.0                   # action exploration variance

if __name__ == '__main__':
    # Create environment
    env = gym.make('Pendulum-v0').unwrapped
    n_state = env.observation_space.shape[0]        # size of the state vector
    n_action = env.action_space.shape[0]            # size of the action vector
    a_limit = env.action_space.high[0]              # largest value the continuous action can take

    # Create network
    net = DDPG(
        n_state = n_state,
        n_action = n_action,
        a_limit = a_limit,
        model_folder = model_folder,
        var = var
    )
    net.load()

    # Train
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        for j in range(max_iter):
            env.render()
            a = net.chooseAction(s)
            s_, r, finish, info = env.step(a)
예제 #13
0
            # One test sample: (user, itemIds for labels i..i+9, itemId at i+10,
            # timestamp[i+9] - timestamp[i+8], reward, objective1, objective2),
            # the last three taken from row i+10.
            test_set.append(
                (user, list(test_user.loc[i:i + 9, 'itemId']),
                 test_user.loc[i + 10,
                               'itemId'], test_user.loc[i + 9, 'timestamp'] -
                 test_user.loc[i + 8, 'timestamp'],
                 float(test_user.loc[i + 10, 'reward']),
                 float(test_user.loc[i + 10, 'objective1']),
                 float(test_user.loc[i + 10, 'objective2'])))
# Trim both sets to a whole number of batches (drops the trailing remainder).
train_set = train_set[:len(train_set) // batch_size * batch_size]
test_set = test_set[:len(test_set) // batch_size * batch_size]

start_time = time.time()

# allow_growth=True is passed so the session is configured to grow its GPU
# memory use instead of reserving it up front.
gpu_options = tf.GPUOptions(allow_growth=True)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    # Two DDPG networks with the same hidden size, told apart by name.
    primary_network = DDPG(hidden_size, 'primary_network')
    target_network = DDPG(hidden_size, 'target_network')
    model = Reinforce_Model(user_count, item_count, hidden_size, batch_size,
                            primary_network, target_network)

    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    # Baseline evaluation before any training epoch; evaluate() must return
    # exactly two values to fill the two format fields.
    print('Objective1_Value: %.4f\t Objective2_Value: %.4f\t' %
          evaluate(sess, model, train_set))
    sys.stdout.flush()
    lr = 1
    # Re-assigned here, overwriting the value taken before the session opened.
    start_time = time.time()
    last_auc = 0.0

    for epoch in range(100):