Example #1
 def __init__(self, config, game_name, env):
     self.config = config
     self.game_name = game_name
     self.model = Perceptron(self.game_name, None, self.config, "MSE", direct=True)
     Reinforce_Suite.__init__(self, config, self.model, env)
     self.policy_gen = GAN_for_Policy(self.game_name, None, self.config, "MSE")
     self.policy_reserve = Policy(self.model, self.config.episilon)
     self.replay_match = config.replay_match
     self.obs = []
     self.replay_size = config.replay_size
     self.pre = config.pre
     self.sample_size = config.sample_size
     self.epoch = config.epoch
     self.train_epoch = config.train_epoch
Example #2
 def __init__(self, config, game_name, env):
     self.config = config
     self.game_name = game_name
     self.model = Perceptron(self.game_name,
                             None,
                             self.config,
                             "MSE",
                             direct=True)
     Reinforce_Suite.__init__(self, config, self.model, env)
     self.reg_model = AutoDecoder(self.game_name, None, self.config, "MSE")
     self.policy_reserve = Policy(self.model, self.config.episilon)
     self.replay_match = config.replay_match
     self.obs = []
     self.replay_size = config.replay_size
     self.pre = config.pre
     self.reg_epoch = config.reg_epoch
Example #3
File: MC_RL.py Project: swtheing/ADRL
 def __init__(self, config, game_name, env):
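     # Select the actor network named in the config: a Perceptron, a Gaussian
     # model (continuous actions), a Cnn, or a Trans_Ptr (multiple actions per
     # step); the conti_act / multi_act flags record which kind of action
     # space the chosen model supports.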
     if config.model == "DNN":
         Actor = Perceptron(game_name, None, config, "MSE")
         self.conti_act = False
         self.multi_act = False
     elif config.model == "Gaussian":
         Actor = Gaussian(game_name, None, config, None)
         self.conti_act = True
         self.multi_act = False
     elif config.model == "CNN":
         Actor = Cnn(game_name, None, config, "MSE")
         self.conti_act = False
         self.multi_act = False
     elif config.model == "TranPtr":
         Actor = Trans_Ptr(game_name, None, config, "MSE")
         self.conti_act = False
         self.multi_act = True
     Reinforce_Suite.__init__(self, config, Actor, env)
     self.replay_match = config.replay_match
     self.replay_size = config.replay_size
     self.observe_id = config.observe_id
     self.on_policy = config.on_policy
     self.replay_switch = config.replay_switch
     self.replay_obs = []
     self.replay_act = []
     self.replay_rew = []
     self.replay_done = []
     self.replay_next = []
     self.replay_Q = []
     self.base_v = 0.0
     self.sum_step = 0
     self.viewer = None
Example #4
 def __init__(self, config, game_name, env):
     Actor = Perceptron(game_name, None, config, "CE", attribute="Actor")
     Reinforce_Suite.__init__(self, config, Actor, env)
     self.Critic = Perceptron(game_name,
                              None,
                              config,
                              "MSE",
                              attribute="Critic")
     self.replay_match = config.replay_match
     self.replay_size = config.replay_size
     self.observe_id = config.observe_id
     self.on_policy = config.on_policy
     self.replay_switch = config.replay_switch
     self.replay_obs = []
     self.replay_act = []
     self.replay_rew = []
     self.replay_done = []
     self.replay_next = []
     self.viewer = None
Example #5
 def __init__(self, config, game_name, env):
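     # Build two networks of the same architecture: `model_reserve`, and
     # `model_predict`, which is constructed with the extra "predict" argument
     # and a reference to `model_reserve`; both are passed to Reinforce_Suite.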
     if config.model == "DNN":
         model_reserve = Perceptron(game_name, None, config, "MSE")
         model_predict = Perceptron(game_name, None, config, "MSE",
                                    "predict", model_reserve)
     else:
         model_reserve = Cnn(game_name, None, config, "MSE")
         model_predict = Cnn(game_name, None, config, "MSE", "predict",
                             model_reserve)
     Reinforce_Suite.__init__(self, config, model_reserve, env,
                              model_predict)
     self.replay_match = config.replay_match
     self.replay_size = config.replay_size
     self.observe_id = config.observe_id
     self.replay_obs = []
     self.replay_act = []
     self.replay_rew = []
     self.replay_done = []
     self.replay_next = []
     self.viewer = None
Example #6
File: DQN.py Project: swtheing/ADRL
 def __init__(self, config, game_name, env):
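     # Pick a Perceptron or Cnn model depending on the config, then initialise
     # the replay-buffer lists (observations, actions, rewards, done flags,
     # next observations, plus replay_s / replay_ns) used during training.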
     if config.model == "DNN":
         model = Perceptron(game_name, None, config, "MSE")
     else:
         model = Cnn(game_name, None, config, "MSE")
     Reinforce_Suite.__init__(self, config, model, env)
     self.replay_match = config.replay_match
     self.replay_size = config.replay_size
     self.observe_id = config.observe_id
     self.replay_obs = []
     self.replay_act = []
     self.replay_rew = []
     self.replay_done = []
     self.replay_next = []
     self.replay_s = []
     self.replay_ns = []
     self.debug = config.debug
     self.viewer = None
     self.sample_len = 0
Example #7
class ActorCritic(Reinforce_Suite):
    def __init__(self, config, game_name, env):
        Actor = Perceptron(game_name, None, config, "CE", attribute="Actor")
        Reinforce_Suite.__init__(self, config, Actor, env)
        self.Critic = Perceptron(game_name,
                                 None,
                                 config,
                                 "MSE",
                                 attribute="Critic")
        self.replay_match = config.replay_match
        self.replay_size = config.replay_size
        self.observe_id = config.observe_id
        self.on_policy = config.on_policy
        self.replay_switch = config.replay_switch
        self.replay_obs = []
        self.replay_act = []
        self.replay_rew = []
        self.replay_done = []
        self.replay_next = []
        self.viewer = None

    def Gen_Batch_Data(self, policy, epoch_num):
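        # Assemble `epoch_num` minibatches from the replay buffer. Actions are
        # shifted to zero-based indices. Terminal transitions use the raw
        # reward as both target and weight; otherwise the critic is queried on
        # the stored observation and the target adds gamma times either the
        # Q-value of the next recorded action (on-policy) or the maximum
        # Q-value, while the Q-value of the action actually taken is kept as
        # the actor's weighting term.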
        batchs = []
        for epoch in range(epoch_num):
            samples = random.sample(range(len(self.replay_obs)),
                                    self.model.batch_size)
            samples_obs = [self.replay_obs[i] for i in samples]
            samples_act = [self.replay_act[i] - 1 for i in samples]
            samples_epr = []
            samples_Q = []
            for i in samples:
                if self.replay_done[i]:
                    samples_Q.append(self.replay_rew[i])
                    samples_epr.append(self.replay_rew[i])
                else:
                    p, Q = self.Critic.test_model([self.replay_obs[i]])
                    if self.on_policy:
                        samples_Q.append(self.replay_rew[i] + self.gamma *
                                         Q[0, self.replay_act[i + 1] - 1])
                    else:
                        samples_Q.append(self.replay_rew[i] +
                                         self.gamma * np.max(Q))
                    samples_epr.append(Q[0, self.replay_act[i] - 1])
            #print samples_Q
            tup = (samples_obs, samples_act, samples_epr, samples_Q)

            batchs.append(tup)
        return batchs

    def Get_Data(self, policy):
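        # Play the environment with the given policy, appending every
        # transition to the replay lists (cleared first when replay_switch is
        # off, and trimmed to replay_size). Runs until replay_match episodes
        # finish and returns the average, best and worst episode reward.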
        observation = self.env.reset()
        match = 0
        over_reward = 0
        max_reward = -21.0
        min_reward = 0
        match_rerward = 0.0
        show_flag = 1

        if not self.replay_switch:
            self.replay_obs = []
            self.replay_act = []
            self.replay_rew = []
            self.replay_done = []
            self.replay_next = []

        if len(self.replay_obs) == self.replay_size:
            del self.replay_obs[0]
            del self.replay_done[0]
            del self.replay_next[0]
            del self.replay_rew[0]
            del self.replay_act[0]

        self.replay_obs.append(observation)
        while True:
            action, Q, Q_debug = policy.action_sel(observation, max_sel=False)
            if len(self.replay_obs) > self.replay_size:
                del self.replay_obs[0]
                del self.replay_done[0]
                del self.replay_next[0]
                del self.replay_rew[0]
                del self.replay_act[0]
            #self.env.render()
            #replay strategy
            # if self.observe_id < len(self.replay_obs):
            #     self.observe_picture = self.replay_obs[self.observe_id][25:,:,:]
            #     if (observation[25:,:,:] == self.observe_picture).all():
            #         if self.viewer is None:
            #             self.viewer = rendering.SimpleImageViewer()
            #         if show_flag == 1:
            #             self.viewer.imshow(observation[25:,:,:])
            #             show_flag = 0
            #         print "observe id: {}, action: {}, Q: {}".format(self.observe_id, action, Q_debug)
            #raw_input("trace image is here (Enter go): ");
            observation, reward, done, info = self.env.step(action)
            self.replay_rew.append(reward)
            self.replay_done.append(done)
            self.replay_act.append(action)
            if not done:
                over_reward += reward
                match_rerward += reward
                self.replay_next.append(observation)
                self.replay_obs.append(observation)
            else:
                if match_rerward > max_reward:
                    max_reward = match_rerward
                elif match_rerward < min_reward:
                    min_reward = match_rerward
                match_rerward = 0
                self.replay_next.append(observation)
                match += 1
                if match == self.replay_match:
                    return over_reward / self.replay_match, max_reward, min_reward
                observation = self.env.reset()
                self.replay_obs.append(observation)

    def Train_Data(self, policy, train_epoch, train_data):
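        # One training step: fit the critic on the batch first, then update
        # the actor through policy.model.train_model using the critic-derived
        # weights (samples_epr) and targets (samples_Q).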
        #samples = self.random_sampling()
        #print [self.replay_Q[i] for i in samples]
        #print "sample ok"
        #print len(self.replay_obs)
        #print len(self.replay_act)
        #print len(self.replay_rew)
        #print len(self.replay_next)
        #print self.replay_Q
        samples_obs, samples_act, samples_epr, samples_Q = train_data
        self.Train_Critic(train_data, train_epoch)
        #print samples_epr
        policy.model.train_model(samples_obs, samples_act, samples_epr,
                                 samples_Q, train_epoch)

    def Train_Critic(self, train_data, train_epoch):
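        # The per-sample weights are overwritten with 1.0 so the critic is
        # regressed directly onto the Q targets.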
        #print "Train Critic"
        samples_obs, samples_act, samples_epr, samples_Q = train_data
        #print samples_Q
        samples_epr = [1.0 for i in range(len(samples_epr))]
        self.Critic.train_model(samples_obs, samples_act, samples_epr,
                                samples_Q, train_epoch)
Example #8
class DPS(Reinforce_Suite):
    def __init__(self, config, game_name, env):
        self.config = config
        self.game_name = game_name
        self.model = Perceptron(self.game_name,
                                None,
                                self.config,
                                "MSE",
                                direct=True)
        Reinforce_Suite.__init__(self, config, self.model, env)
        self.reg_model = AutoDecoder(self.game_name, None, self.config, "MSE")
        self.policy_reserve = Policy(self.model, self.config.episilon)
        self.replay_match = config.replay_match
        self.obs = []
        self.replay_size = config.replay_size
        self.pre = config.pre
        self.reg_epoch = config.reg_epoch

    def Pre_Reg(self, policy):
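        # Pre-train the AutoDecoder on observations collected with the reserve
        # policy (collecting a fresh batch if none are stored yet) and return
        # its learned weights, which Policy_Search can reuse as the first
        # layer of the policy network.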
        if len(self.obs) == 0:
            mean_reward, max_reward, min_reward = self.Get_Data(
                self.policy_reserve)
        obs_batch = self.Gen_Batch_Data(self.policy_reserve, self.reg_epoch)
        for i in range(self.reg_epoch):
            self.reg_model.train_model(obs_batch[i], None, None, None, i)
        return self.reg_model.get_w()

    def Policy_Search(self):
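        # Derivative-free policy search: `ackley` maps a flat weight vector
        # into the Perceptron, evaluates it by playing replay_match episodes
        # and returns the negated mean reward; the black-box optimizer then
        # minimises it over `dim` weights, each bounded to [-0.01, 0.01]. With
        # pre-training enabled only the second weight matrix is searched, the
        # first coming from Pre_Reg.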
        def ackley(solution):
            value = []
            if self.pre:
                w1 = self.Pre_Reg(self.policy_reserve)
            w_1_dim = (self.config.feature_size, self.config.hidden_size)
            w_2_dim = (self.config.hidden_size, self.config.action_size)
            w_flat = solution.get_x()
            if not self.pre:
                value.append(
                    np.reshape(w_flat[0:w_1_dim[0] * w_1_dim[1]], w_1_dim))
                value.append(
                    np.reshape(w_flat[w_1_dim[0] * w_1_dim[1]:], w_2_dim))
            else:
                value.append(w1)
                value.append(np.reshape(w_flat, w_2_dim))
            self.model._assign(value)
            self.model.train_model(None, None, None, None, None)
            mean_reward, max_reward, min_reward = self.Get_Data(
                self.policy_reserve)
            print("eval max_reward: {}, min_reward: {}, mean_reward: {}".format(
                max_reward, min_reward, mean_reward))
            return -mean_reward

        if not self.pre:
            dim = self.config.feature_size * self.config.hidden_size + self.config.hidden_size * self.config.action_size
        else:
            dim = self.config.hidden_size * self.config.action_size
        obj = Objective(ackley,
                        Dimension(dim, [[-0.01, 0.01]] * dim, [True] * dim))
        solution = Opt.min(
            obj,
            Parameter(budget=100 * dim,
                      uncertain_bits=100,
                      intermediate_result=False,
                      intermediate_freq=1))
        solution.print_solution()

    def Get_Data(self, policy):
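        # Roll out the policy for replay_match episodes, keeping a bounded
        # window of observations in self.obs for Pre_Reg, and return the
        # average, best and worst episode reward.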
        observation = self.env.reset()
        if len(self.obs) == self.replay_size:
            del self.obs[0]
        self.obs.append(observation)
        match = 0
        over_reward = 0
        max_reward = -21.0
        min_reward = 0
        match_rerward = 0.0
        observation_batch = [observation]
        while True:
            #self.env.render()
            action, Q, Q_debug = policy.action_sel(observation_batch)
            observation, reward, done, info = self.env.step(action)
            if len(self.obs) == self.replay_size:
                del self.obs[0]
            self.obs.append(observation)
            if not done:
                over_reward += reward
                match_rerward += reward
                observation_batch = [observation]
            else:
                if match_rerward > max_reward:
                    max_reward = match_rerward
                elif match_rerward < min_reward:
                    min_reward = match_rerward
                match_rerward = 0
                match += 1
                if match == self.replay_match:
                    return over_reward / self.replay_match, max_reward, min_reward
                observation = self.env.reset()
                observation_batch = [observation]

    def Gen_Batch_Data(self, policy, epoch_num):
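        # Draw `epoch_num` random observation batches (of the AutoDecoder's
        # batch size) from the stored observations for regression training.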
        batchs = []
        for epoch in range(epoch_num):
            samples = random.sample(range(len(self.obs)),
                                    self.reg_model.batch_size)
            samples_obs = [self.obs[i] for i in samples]
            batchs.append(samples_obs)
        return batchs
Example #9
            # strip() removes leading/trailing characters from the line (whitespace/newlines by default)
            # split(',') cuts the string on commas and returns the fields as a list
            curLine = line.strip().split(',')
            # Put everything except the label (curLine[0]) into the data set,
            # converting the string fields to integers as they go in.
            # The values are also binarised: anything greater than 128 becomes 1,
            # everything else becomes 0, which simplifies later computation.
            dataArr.append([int(int(num) > 128) for num in curLine[1:]])
            # Put the label into the label set, converted to an integer:
            # digit 1 maps to +1, every other digit maps to -1.
            labelArr.append(1 if int(curLine[0]) == 1 else -1)
    # Return the data set and the labels
    return np.array(dataArr), np.array(labelArr)


if __name__ == '__main__':
    # Read the data
    data, labels = load_data()
    # Shuffle the data and split it into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.2,
                                                        random_state=10)
    # Create the perceptron classifier
    model = Perceptron(len(x_train[0]))
    # Train the perceptron model on the training set
    train_acc = model.train(x_train, y_train)
    print('train acc:' + str(train_acc))
    # Evaluate on the test set
    test_acc = model.test(x_test, y_test)
    print('test acc:' + str(test_acc))
Example #10
class Policy_Generator(Reinforce_Suite):
    def __init__(self, config, game_name, env):
        self.config = config
        self.game_name = game_name
        self.model = Perceptron(self.game_name, None, self.config, "MSE", direct=True)
        Reinforce_Suite.__init__(self, config, self.model, env)
        self.policy_gen = GAN_for_Policy(self.game_name, None, self.config, "MSE")
        self.policy_reserve = Policy(self.model, self.config.episilon)
        self.replay_match = config.replay_match
        self.obs = []
        self.replay_size = config.replay_size
        self.pre = config.pre
        self.sample_size = config.sample_size
        self.epoch = config.epoch
        self.train_epoch = config.train_epoch

    def Get_Data(self, policy):
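        # Same rollout loop as DPS.Get_Data: play replay_match episodes with
        # the given policy, keep a bounded window of observations in self.obs,
        # and return the average, best and worst episode reward.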
        observation = self.env.reset()
        if len(self.obs) == self.replay_size:
            del self.obs[0]
        self.obs.append(observation)
        match = 0
        over_reward = 0
        max_reward = -21.0
        min_reward = 0
        match_rerward = 0.0
        observation_batch = [observation]
        while True:
            #self.env.render()
            action, Q, Q_debug = policy.action_sel(observation_batch)
            observation, reward, done, info = self.env.step(action)
            if len(self.obs) == self.replay_size:
                del self.obs[0]
            self.obs.append(observation)
            if not done:
                over_reward += reward
                match_rerward += reward
                observation_batch = [observation]
            else:
                if match_rerward > max_reward:
                    max_reward = match_rerward
                elif match_rerward < min_reward:
                    min_reward = match_rerward
                match_rerward = 0
                match += 1
                if match == self.replay_match:
                    return over_reward / self.replay_match, max_reward, min_reward
                observation = self.env.reset()
                observation_batch = [observation]

    def Policy_Search(self):
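        # Run Policy_Opt for the configured number of epochs and report the
        # best, worst and mean rollout score of the sampled policies.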
        for epoch in range(self.epoch):
            min_score, min_W, max_score, max_W, mean_score = self.Policy_Opt(epoch)
            print("epoch {}, min_score: {}, max_score: {}, mean_score: {}".format(
                epoch, min_score, max_score, mean_score))

    def Policy_Opt(self, epoch):
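        # Sample `sample_size` flattened weight vectors from the GAN generator
        # and score each one by assigning it into the Perceptron and playing
        # replay_match episodes (`ackley` returns the mean reward). The
        # batch_size lowest-scoring stored samples become positives, randomly
        # drawn higher-scoring ones become negatives, and the GAN is trained
        # on them for train_epoch iterations; the best, worst and mean scores
        # of this round are returned.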
        def ackley(w_flat):
            value = []
            w_1_dim = (self.config.feature_size, self.config.hidden_size)
            w_2_dim = (self.config.hidden_size, self.config.action_size)
            value.append(np.reshape(w_flat[0:w_1_dim[0] * w_1_dim[1]], w_1_dim))
            value.append(np.reshape(w_flat[w_1_dim[0] * w_1_dim[1]:], w_2_dim))
            self.model._assign(value)
            self.model.train_model(None, None, None, None, None)
            mean_reward, max_reward, min_reward = self.Get_Data(self.policy_reserve)
            print("eval max_reward: {}, min_reward: {}, mean_reward: {}".format(
                max_reward, min_reward, mean_reward))
            return mean_reward
        z_sample = self.policy_gen._noise_gen(self.sample_size)
        feed = {self.policy_gen.tf_noise: z_sample}
        W_fake = self.policy_gen.sess.run([self.policy_gen.W_fake], feed_dict=feed)
        count = len(self.policy_gen.w_samples.keys())
        min_score = 20.0
        max_score = -20.0
        mean_score = 0.0
        for i in range(self.sample_size):
            score = ackley(W_fake[0][i])
            if min_score > score:
                min_score = score
                min_W = W_fake[0][i]
            if max_score < score:
                max_score = score
                max_W = W_fake[0][i]
            mean_score += score / self.sample_size
            self.policy_gen.w_samples[count] = W_fake[0][i]
            self.policy_gen.w_scores[count] = score
            count += 1
        self.sort_sample = sorted(self.policy_gen.w_scores.items(), key=lambda x: x[1])
        pos_batches = []
        for i in range(self.policy_gen.batch_size):
            pos_batches.append(self.policy_gen.w_samples[self.sort_sample[i][0]])
        for iter in range(self.train_epoch):
            samples = random.sample(range(len(self.policy_gen.w_samples) - self.policy_gen.batch_size), self.policy_gen.batch_size)
            z_sample = self.policy_gen._noise_gen(2 * self.policy_gen.batch_size)
            neg_batches = []
            for i in samples:
                neg_batches.append(self.policy_gen.w_samples[self.sort_sample[i + self.policy_gen.batch_size][0]])
            pos_labels = np.ones(self.policy_gen.batch_size)
            neg_labels = np.zeros(self.policy_gen.batch_size)
            labels = np.reshape(np.concatenate([pos_labels, neg_labels], axis=0), [2 * self.policy_gen.batch_size, -1])
            data = (z_sample, pos_batches + neg_batches, labels)
            self.policy_gen.train_model(data, None, None, None, iter)

        return min_score, min_W, max_score, max_W, mean_score