def __init__(self, config, game_name, env):
    # Pick the actor network from the configured model type; conti_act flags a
    # continuous action space, multi_act flags multi-action (pointer) output.
    if config.model == "DNN":
        Actor = Perceptron(game_name, None, config, "MSE")
        self.conti_act = False
        self.multi_act = False
    elif config.model == "Gaussian":
        Actor = Gaussian(game_name, None, config, None)
        self.conti_act = True
        self.multi_act = False
    elif config.model == "CNN":
        Actor = Cnn(game_name, None, config, "MSE")
        self.conti_act = False
        self.multi_act = False
    elif config.model == "TranPtr":
        Actor = Trans_Ptr(game_name, None, config, "MSE")
        self.conti_act = False
        self.multi_act = True
    else:
        raise ValueError("unknown model type: {}".format(config.model))
    Reinforce_Suite.__init__(self, config, Actor, env)
    self.replay_match = config.replay_match
    self.replay_size = config.replay_size
    self.observe_id = config.observe_id
    self.on_policy = config.on_policy
    self.replay_switch = config.replay_switch
    self.replay_obs = []
    self.replay_act = []
    self.replay_rew = []
    self.replay_done = []
    self.replay_next = []
    self.replay_Q = []
    self.base_v = 0.0
    self.sum_step = 0
    self.viewer = None
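# The constructors in this file read their hyper-parameters off a single
# `config` object. A hedged sketch of what such a config might look like for
# the constructor above, using argparse.Namespace; the field names come from
# the code, but every value here is an illustrative guess, not a project default.
from argparse import Namespace

example_config = Namespace(
    model="DNN",           # one of "DNN", "Gaussian", "CNN", "TranPtr"
    replay_match=10,       # episodes ("matches") collected per Get_Data call
    replay_size=10000,     # maximum number of transitions kept in replay
    observe_id=0,          # index of a replayed frame to inspect when debugging
    on_policy=True,        # SARSA-style targets if True, max-Q targets if False
    replay_switch=True,    # keep the replay buffer across Get_Data calls
)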
def __init__(self, config, game_name, env):
    # Build two value networks: model_reserve, and model_predict which is
    # constructed with a reference to it (a reserve/predict split in the
    # style of DQN's target and online networks).
    if config.model == "DNN":
        model_reserve = Perceptron(game_name, None, config, "MSE")
        model_predict = Perceptron(game_name, None, config, "MSE", "predict", model_reserve)
    else:
        model_reserve = Cnn(game_name, None, config, "MSE")
        model_predict = Cnn(game_name, None, config, "MSE", "predict", model_reserve)
    Reinforce_Suite.__init__(self, config, model_reserve, env, model_predict)
    self.replay_match = config.replay_match
    self.replay_size = config.replay_size
    self.observe_id = config.observe_id
    self.replay_obs = []
    self.replay_act = []
    self.replay_rew = []
    self.replay_done = []
    self.replay_next = []
    self.viewer = None
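# The reserve/predict pair above appears to mirror the usual online/target
# split in deep Q-learning. A minimal, framework-free sketch of that pattern
# (TinyNet, sync_every and all names below are illustrative, not the
# Reinforce_Suite API): the trained network is copied into the frozen one
# every few updates.
import numpy as np

class TinyNet(object):
    def __init__(self, in_dim, out_dim):
        self.w = np.random.randn(in_dim, out_dim) * 0.01
    def q_values(self, obs):
        return obs.dot(self.w)
    def copy_from(self, other):
        self.w = other.w.copy()

online, target = TinyNet(4, 2), TinyNet(4, 2)
sync_every = 100
for step in range(1000):
    # ...gradient update of `online` against targets from `target` would go here...
    if step % sync_every == 0:
        target.copy_from(online)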
def __init__(self, config, game_name, env):
    if config.model == "DNN":
        model = Perceptron(game_name, None, config, "MSE")
    else:
        model = Cnn(game_name, None, config, "MSE")
    Reinforce_Suite.__init__(self, config, model, env)
    self.replay_match = config.replay_match
    self.replay_size = config.replay_size
    self.observe_id = config.observe_id
    # Experience-replay storage: observations, actions, rewards, terminal
    # flags, next observations, plus two extra state buffers.
    self.replay_obs = []
    self.replay_act = []
    self.replay_rew = []
    self.replay_done = []
    self.replay_next = []
    self.replay_s = []
    self.replay_ns = []
    self.debug = config.debug
    self.viewer = None
    self.sample_len = 0
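# The replay lists above are capped by deleting index 0 once replay_size is
# reached. A self-contained sketch of the same drop-oldest behaviour with
# collections.deque (illustrative only; the class keeps parallel Python lists):
import random
from collections import deque

replay = deque(maxlen=5)                           # oldest transition is evicted automatically
for t in range(8):
    replay.append((t, "obs", t % 3, 0.0, False))   # (step, obs, action, reward, done)
batch = random.sample(list(replay), 3)             # uniform minibatch, as in Gen_Batch_Data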
class ActorCritic(Reinforce_Suite):
    def __init__(self, config, game_name, env):
        Actor = Perceptron(game_name, None, config, "CE", attribute="Actor")
        Reinforce_Suite.__init__(self, config, Actor, env)
        self.Critic = Perceptron(game_name, None, config, "MSE", attribute="Critic")
        self.replay_match = config.replay_match
        self.replay_size = config.replay_size
        self.observe_id = config.observe_id
        self.on_policy = config.on_policy
        self.replay_switch = config.replay_switch
        self.replay_obs = []
        self.replay_act = []
        self.replay_rew = []
        self.replay_done = []
        self.replay_next = []
        self.viewer = None

    def Gen_Batch_Data(self, policy, epoch_num):
        # Sample epoch_num minibatches from the replay buffer and attach
        # TD targets (samples_Q) and critic estimates (samples_epr).
        batchs = []
        for epoch in range(epoch_num):
            samples = random.sample(range(len(self.replay_obs)), self.model.batch_size)
            samples_obs = [self.replay_obs[i] for i in samples]
            samples_act = [self.replay_act[i] - 1 for i in samples]
            samples_epr = []
            samples_Q = []
            for i in samples:
                if self.replay_done[i]:
                    samples_Q.append(self.replay_rew[i])
                    samples_epr.append(self.replay_rew[i])
                else:
                    p, Q = self.Critic.test_model([self.replay_obs[i]])
                    if self.on_policy:
                        # On-policy (SARSA-style) target using the action actually
                        # taken at the next step; assumes index i + 1 is still in the buffer.
                        samples_Q.append(self.replay_rew[i] + self.gamma * Q[0, self.replay_act[i + 1] - 1])
                    else:
                        # Off-policy (Q-learning) target using the greedy action.
                        samples_Q.append(self.replay_rew[i] + self.gamma * np.max(Q))
                    samples_epr.append(Q[0, self.replay_act[i] - 1])
            #print samples_Q
            tup = (samples_obs, samples_act, samples_epr, samples_Q)
            batchs.append(tup)
        return batchs

    def Get_Data(self, policy):
        observation = self.env.reset()
        match = 0
        over_reward = 0
        max_reward = -21.0
        min_reward = 0
        match_reward = 0.0
        show_flag = 1
        if not self.replay_switch:
            # Start from an empty buffer when replay is not kept across calls.
            self.replay_obs = []
            self.replay_act = []
            self.replay_rew = []
            self.replay_done = []
            self.replay_next = []
        if len(self.replay_obs) == self.replay_size:
            del self.replay_obs[0]
            del self.replay_done[0]
            del self.replay_next[0]
            del self.replay_rew[0]
            del self.replay_act[0]
        self.replay_obs.append(observation)
        while True:
            action, Q, Q_debug = policy.action_sel(observation, max_sel=False)
            if len(self.replay_obs) > self.replay_size:
                # Drop the oldest transition once the buffer is full.
                del self.replay_obs[0]
                del self.replay_done[0]
                del self.replay_next[0]
                del self.replay_rew[0]
                del self.replay_act[0]
            #self.env.render()
            #replay strategy
            # if self.observe_id < len(self.replay_obs):
            #     self.observe_picture = self.replay_obs[self.observe_id][25:,:,:]
            #     if (observation[25:,:,:] == self.observe_picture).all():
            #         if self.viewer is None:
            #             self.viewer = rendering.SimpleImageViewer()
            #         if show_flag == 1:
            #             self.viewer.imshow(observation[25:,:,:])
            #             show_flag = 0
            #         print "observe id: {}, action: {}, Q: {}".format(self.observe_id, action, Q_debug)
            #         #raw_input("trace image is here (Enter go): ")
            observation, reward, done, info = self.env.step(action)
            self.replay_rew.append(reward)
            self.replay_done.append(done)
            self.replay_act.append(action)
            if not done:
                over_reward += reward
                match_reward += reward
                self.replay_next.append(observation)
                self.replay_obs.append(observation)
            else:
                if match_reward > max_reward:
                    max_reward = match_reward
                elif match_reward < min_reward:
                    min_reward = match_reward
                match_reward = 0
                self.replay_next.append(observation)
                match += 1
                if match == self.replay_match:
                    return over_reward / self.replay_match, max_reward, min_reward
                observation = self.env.reset()
                self.replay_obs.append(observation)

    def Train_Data(self, policy, train_epoch, train_data):
        #samples = self.random_sampling()
        #print [self.replay_Q[i] for i in samples]
        #print "sample ok"
        #print len(self.replay_obs)
        #print len(self.replay_act)
        #print len(self.replay_rew)
        #print len(self.replay_next)
        #print self.replay_Q
        samples_obs, samples_act, samples_epr, samples_Q = train_data
        self.Train_Critic(train_data, train_epoch)
        #print samples_epr
        policy.model.train_model(samples_obs, samples_act, samples_epr, samples_Q, train_epoch)

    def Train_Critic(self, train_data, train_epoch):
        #print "Train Critic"
        samples_obs, samples_act, samples_epr, samples_Q = train_data
        #print samples_Q
        # The critic is trained with unit weights; only samples_Q matters here.
        samples_epr = [1.0 for i in range(len(samples_epr))]
        self.Critic.train_model(samples_obs, samples_act, samples_epr, samples_Q, train_epoch)
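# The Q-targets built in Gen_Batch_Data follow the TD(0) rule: the raw reward
# for terminal steps, otherwise reward + gamma * Q(next) with the next action
# taken from the trajectory (on-policy) or greedily (off-policy). A
# self-contained numpy sketch of that computation, with plain arrays standing
# in for the critic's output; function and argument names are illustrative.
import numpy as np

def td_targets(rewards, dones, next_q, next_actions, gamma=0.99, on_policy=True):
    # next_q[i]: Q-value row for the step after i; next_actions[i]: action taken there.
    targets = []
    for i in range(len(rewards)):
        if dones[i]:
            targets.append(rewards[i])
        elif on_policy:
            targets.append(rewards[i] + gamma * next_q[i][next_actions[i]])
        else:
            targets.append(rewards[i] + gamma * np.max(next_q[i]))
    return targets

# toy check: one non-terminal and one terminal transition
print(td_targets([1.0, 0.0], [False, True], [np.array([0.2, 0.5]), None], [1, None]))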
class DPS(Reinforce_Suite):
    def __init__(self, config, game_name, env):
        self.config = config
        self.game_name = game_name
        # Build the direct policy model first so it can be handed to the suite.
        self.model = Perceptron(self.game_name, None, self.config, "MSE", direct=True)
        Reinforce_Suite.__init__(self, config, self.model, env)
        self.reg_model = AutoDecoder(self.game_name, None, self.config, "MSE")
        self.policy_reserve = Policy(self.model, self.config.episilon)
        self.replay_match = config.replay_match
        self.obs = []
        self.replay_size = config.replay_size
        self.pre = config.pre
        self.reg_epoch = config.reg_epoch

    def Pre_Reg(self, policy):
        # Pre-train the auto-encoder on observations; its first-layer weights
        # are reused as fixed features during policy search when config.pre is set.
        if len(self.obs) == 0:
            mean_reward, max_reward, min_reward = self.Get_Data(self.policy_reserve)
        obs_batch = self.Gen_Batch_Data(self.policy_reserve, self.reg_epoch)
        for i in range(self.reg_epoch):
            self.reg_model.train_model(obs_batch[i], None, None, None, i)
        return self.reg_model.get_w()

    def Policy_Search(self):
        def ackley(solution):
            # ZOOpt objective: unpack the candidate weights, load them into the
            # policy network, and return the negated mean episode reward
            # (ZOOpt minimizes).
            value = []
            if self.pre:
                w1 = self.Pre_Reg(self.policy_reserve)
            w_1_dim = (self.config.feature_size, self.config.hidden_size)
            w_2_dim = (self.config.hidden_size, self.config.action_size)
            w_flat = solution.get_x()
            if not self.pre:
                value.append(np.reshape(w_flat[0:w_1_dim[0] * w_1_dim[1]], w_1_dim))
                value.append(np.reshape(w_flat[w_1_dim[0] * w_1_dim[1]:], w_2_dim))
            else:
                value.append(w1)
                value.append(np.reshape(w_flat, w_2_dim))
            self.model._assign(value)
            self.model.train_model(None, None, None, None, None)
            mean_reward, max_reward, min_reward = self.Get_Data(self.policy_reserve)
            print "eval max_reward: {}, min_reward: {}, mean_reward: {}".format(max_reward, min_reward, mean_reward)
            return -mean_reward

        if not self.pre:
            dim = self.config.feature_size * self.config.hidden_size + self.config.hidden_size * self.config.action_size
        else:
            dim = self.config.hidden_size * self.config.action_size
        obj = Objective(ackley, Dimension(dim, [[-0.01, 0.01]] * dim, [True] * dim))
        solution = Opt.min(obj, Parameter(budget=100 * dim, uncertain_bits=100, intermediate_result=False, intermediate_freq=1))
        solution.print_solution()

    def Get_Data(self, policy):
        observation = self.env.reset()
        if len(self.obs) == self.replay_size:
            del self.obs[0]
        self.obs.append(observation)
        match = 0
        over_reward = 0
        max_reward = -21.0
        min_reward = 0
        match_reward = 0.0
        observation_batch = [observation]
        while True:
            #self.env.render()
            action, Q, Q_debug = policy.action_sel(observation_batch)
            observation, reward, done, info = self.env.step(action)
            if len(self.obs) == self.replay_size:
                del self.obs[0]
            self.obs.append(observation)
            if not done:
                over_reward += reward
                match_reward += reward
                observation_batch = [observation]
            else:
                if match_reward > max_reward:
                    max_reward = match_reward
                elif match_reward < min_reward:
                    min_reward = match_reward
                match_reward = 0
                match += 1
                if match == self.replay_match:
                    return over_reward / self.replay_match, max_reward, min_reward
                observation = self.env.reset()
                observation_batch = [observation]

    def Gen_Batch_Data(self, policy, epoch_num):
        batchs = []
        for epoch in range(epoch_num):
            samples = random.sample(range(len(self.obs)), self.reg_model.batch_size)
            samples_obs = [self.obs[i] for i in samples]
            batchs.append(samples_obs)
        return batchs
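# Policy_Search drives ZOOpt's derivative-free optimizer through the
# Objective / Dimension / Parameter / Opt API used above. A minimal standalone
# example of that API on a toy sphere function (the search ranges and budget
# here are arbitrary, chosen only for illustration):
from zoopt import Dimension, Objective, Parameter, Opt

def sphere(solution):
    x = solution.get_x()
    return sum(v * v for v in x)

dim = 10
objective = Objective(sphere, Dimension(dim, [[-1, 1]] * dim, [True] * dim))
best = Opt.min(objective, Parameter(budget=100 * dim))
print(best.get_x())
print(best.get_value())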
        # Inside load_data(), for each line of the data file (CSV, label first):
        # strip() removes leading/trailing whitespace (spaces/newlines by default),
        # split() cuts the string into fields on the given separator.
        curLine = line.strip().split(',')
        # Store everything except the label (curLine[0] is the label),
        # converting each field from string to int. The pixels are also
        # binarized: values greater than 128 become 1, the rest become 0,
        # which simplifies the later computation.
        dataArr.append([int(int(num) > 128) for num in curLine[1:]])
        # Store the label as an integer: 1 for class "1", -1 for everything else.
        labelArr.append(1 if int(curLine[0]) == 1 else -1)
    # Return the data set and the label set.
    return np.array(dataArr), np.array(labelArr)


if __name__ == '__main__':
    # Load the data.
    data, labels = load_data()
    # Shuffle and split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=10)
    # Create the perceptron classifier.
    model = Perceptron(len(x_train[0]))
    # Train the perceptron model.
    train_acc = model.train(x_train, y_train)
    print('train acc:' + str(train_acc))
    # Evaluate on the test set.
    test_acc = model.test(x_test, y_test)
    print('test acc:' + str(test_acc))
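# The script above assumes a Perceptron class whose constructor takes the
# feature count and whose train()/test() methods return accuracies. A hedged
# minimal sketch of such a class (the learning rate, iteration count, and the
# classic misclassification-driven update rule are assumptions, not
# necessarily what the original implementation uses):
import numpy as np

class Perceptron(object):
    def __init__(self, n_features, lr=0.001, iters=30):
        self.w = np.zeros(n_features)
        self.b = 0.0
        self.lr = lr
        self.iters = iters

    def train(self, x, y):
        # Update only on misclassified samples, i.e. when y_i * (w.x_i + b) <= 0.
        for _ in range(self.iters):
            for xi, yi in zip(x, y):
                if yi * (np.dot(self.w, xi) + self.b) <= 0:
                    self.w = self.w + self.lr * yi * np.asarray(xi, dtype=float)
                    self.b += self.lr * yi
        return self.test(x, y)

    def test(self, x, y):
        preds = np.sign(np.dot(x, self.w) + self.b)
        preds[preds == 0] = -1
        return float(np.mean(preds == np.asarray(y)))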
class Policy_Generator(Reinforce_Suite):
    def __init__(self, config, game_name, env):
        self.config = config
        self.game_name = game_name
        # Build the direct policy model first so it can be handed to the suite.
        self.model = Perceptron(self.game_name, None, self.config, "MSE", direct=True)
        Reinforce_Suite.__init__(self, config, self.model, env)
        self.policy_gen = GAN_for_Policy(self.game_name, None, self.config, "MSE")
        self.policy_reserve = Policy(self.model, self.config.episilon)
        self.replay_match = config.replay_match
        self.obs = []
        self.replay_size = config.replay_size
        self.pre = config.pre
        self.sample_size = config.sample_size
        self.epoch = config.epoch
        self.train_epoch = config.train_epoch

    def Get_Data(self, policy):
        observation = self.env.reset()
        if len(self.obs) == self.replay_size:
            del self.obs[0]
        self.obs.append(observation)
        match = 0
        over_reward = 0
        max_reward = -21.0
        min_reward = 0
        match_reward = 0.0
        observation_batch = [observation]
        while True:
            #self.env.render()
            action, Q, Q_debug = policy.action_sel(observation_batch)
            observation, reward, done, info = self.env.step(action)
            if len(self.obs) == self.replay_size:
                del self.obs[0]
            self.obs.append(observation)
            if not done:
                over_reward += reward
                match_reward += reward
                observation_batch = [observation]
            else:
                if match_reward > max_reward:
                    max_reward = match_reward
                elif match_reward < min_reward:
                    min_reward = match_reward
                match_reward = 0
                match += 1
                if match == self.replay_match:
                    return over_reward / self.replay_match, max_reward, min_reward
                observation = self.env.reset()
                observation_batch = [observation]

    def Policy_Search(self):
        for epoch in range(self.epoch):
            min_score, min_W, max_score, max_W, mean_score = self.Policy_Opt(epoch)
            print "epoch {}, min_score: {}, max_score: {}, mean_score: {}".format(epoch, min_score, max_score, mean_score)

    def Policy_Opt(self, epoch):
        def ackley(w_flat):
            # Load a flat weight vector into the policy network and return the
            # mean episode reward of that policy.
            value = []
            w_1_dim = (self.config.feature_size, self.config.hidden_size)
            w_2_dim = (self.config.hidden_size, self.config.action_size)
            value.append(np.reshape(w_flat[0:w_1_dim[0] * w_1_dim[1]], w_1_dim))
            value.append(np.reshape(w_flat[w_1_dim[0] * w_1_dim[1]:], w_2_dim))
            self.model._assign(value)
            self.model.train_model(None, None, None, None, None)
            mean_reward, max_reward, min_reward = self.Get_Data(self.policy_reserve)
            print "eval max_reward: {}, min_reward: {}, mean_reward: {}".format(max_reward, min_reward, mean_reward)
            return mean_reward

        # Sample candidate weight vectors from the generator and score each one.
        z_sample = self.policy_gen._noise_gen(self.sample_size)
        feed = {self.policy_gen.tf_noise: z_sample}
        W_fake = self.policy_gen.sess.run([self.policy_gen.W_fake], feed_dict=feed)
        count = len(self.policy_gen.w_samples.keys())
        min_score = 20.0
        max_score = -20.0
        mean_score = 0.0
        min_W = None
        max_W = None
        for i in range(self.sample_size):
            score = ackley(W_fake[0][i])
            if min_score > score:
                min_score = score
                min_W = W_fake[0][i]
            if max_score < score:
                max_score = score
                max_W = W_fake[0][i]
            mean_score += score / self.sample_size
            self.policy_gen.w_samples[count] = W_fake[0][i]
            self.policy_gen.w_scores[count] = score
            count += 1
        # Rank all stored samples by score (ascending); the first batch_size
        # entries become the positive batch for the GAN.
        self.sort_sample = sorted(self.policy_gen.w_scores.items(), key=lambda x: x[1])
        pos_batches = []
        for i in range(self.policy_gen.batch_size):
            pos_batches.append(self.policy_gen.w_samples[self.sort_sample[i][0]])
        for iter in range(self.train_epoch):
            samples = random.sample(range(len(self.policy_gen.w_samples) - self.policy_gen.batch_size), self.policy_gen.batch_size)
            z_sample = self.policy_gen._noise_gen(2 * self.policy_gen.batch_size)
            neg_batches = []
            for i in samples:
                neg_batches.append(self.policy_gen.w_samples[self.sort_sample[i + self.policy_gen.batch_size][0]])
            pos_labels = np.ones(self.policy_gen.batch_size)
            neg_labels = np.zeros(self.policy_gen.batch_size)
            labels = np.reshape(np.concatenate([pos_labels, neg_labels], axis=0), [2 * self.policy_gen.batch_size, -1])
            data = (z_sample, pos_batches + neg_batches, labels)
            self.policy_gen.train_model(data, None, None, None, iter)
        return min_score, min_W, max_score, max_W, mean_score
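# The positive and negative batches fed to GAN_for_Policy are built by ranking
# every stored weight sample by its evaluation score and taking the first
# batch_size entries of that ranking as positives, with negatives drawn at
# random from the remainder. A self-contained sketch of that ranking step
# (random placeholder weights and scores; all names here are illustrative):
import random
import numpy as np

batch_size = 4
w_samples = {i: np.random.randn(8) for i in range(20)}          # candidate policy weights
w_scores = {i: random.uniform(-21.0, 21.0) for i in range(20)}  # their evaluation scores

ranked = sorted(w_scores.items(), key=lambda kv: kv[1])          # ascending by score
pos_batch = [w_samples[k] for k, _ in ranked[:batch_size]]
neg_ids = random.sample(range(len(ranked) - batch_size), batch_size)
neg_batch = [w_samples[ranked[i + batch_size][0]] for i in neg_ids]
labels = np.reshape(np.concatenate([np.ones(batch_size), np.zeros(batch_size)]), [2 * batch_size, -1])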