from itertools import islice
from operator import itemgetter

from sortedcontainers import SortedDict

# `Peer` is an external class of this project; `Peer.distance(...)` is expected
# to return a key function measuring distance from `my_key`.


class LeafSet(object):
    __slots__ = ('peers', 'capacity')
    __passthru = {'get', 'clear', 'pop', 'popitem', 'peekitem', 'key'}
    __iters = {'keys', 'values', 'items'}

    def __init__(self, my_key, iterable=(), capacity=8):
        try:
            iterable = iterable.items()  # view object
        except AttributeError:
            pass
        tuple_itemgetter = Peer.distance(my_key, itemgetter(0))
        key_itemgetter = Peer.distance(my_key)
        self.capacity = capacity
        self.peers = SortedDict(key_itemgetter)
        if iterable:
            l = sorted(iterable, key=tuple_itemgetter)
            self.peers.update(islice(l, capacity))

    def clear(self):
        self.peers.clear()

    def prune(self):
        extra = len(self) - self.capacity
        for i in range(extra):
            # Pop the last (farthest) peer. Called without arguments this works
            # with both the old sortedcontainers popitem(last=True) API and the
            # current popitem(index=-1) API.
            self.peers.popitem()

    def update(self, iterable):
        try:
            iterable = iterable.items()  # view object
        except AttributeError:
            pass
        iterable = iter(iterable)
        items = tuple(islice(iterable, 500))
        while items:
            self.peers.update(items)
            items = tuple(islice(iterable, 500))

    def setdefault(self, *args, **kwargs):
        self.peers.setdefault(*args, **kwargs)
        self.prune()

    def __setitem__(self, *args, **kwargs):
        self.peers.__setitem__(*args, **kwargs)
        self.prune()

    def __getitem__(self, *args, **kwargs):
        return self.peers.__getitem__(*args, **kwargs)

    def __delitem__(self, *args, **kwargs):
        return self.peers.__delitem__(*args, **kwargs)

    def __iter__(self, *args, **kwargs):
        return self.peers.__iter__(*args, **kwargs)

    def __reversed__(self, *args, **kwargs):
        return self.peers.__reversed__(*args, **kwargs)

    def __contains__(self, *args, **kwargs):
        return self.peers.__contains__(*args, **kwargs)

    def __len__(self, *args, **kwargs):
        return self.peers.__len__(*args, **kwargs)

    def __getattr__(self, key):
        if key in self.__class__.__passthru:
            return getattr(self.peers, key)
        elif key in self.__class__.__iters:
            # sortedcontainers 1.x exposed iterkeys/itervalues/iteritems.
            return getattr(self.peers, 'iter' + key)
        else:
            raise AttributeError(key)

    def __repr__(self):
        return '<%s keys=%r capacity=%d/%d>' % (
            self.__class__.__name__, list(self), len(self), self.capacity)
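
# --- Usage sketch (not from the original project) ---
# A minimal, hypothetical stand-in for the external `Peer` class, assuming that
# `Peer.distance(my_key, getter=None)` returns a key function measuring XOR
# distance from `my_key` (optionally extracting the key via `getter` first).
# The real Peer implementation may differ; this only illustrates how LeafSet
# keeps the `capacity` closest peers and evicts the farthest ones.
class Peer:
    @staticmethod
    def distance(my_key, getter=None):
        def key_func(obj):
            key = getter(obj) if getter is not None else obj
            return key ^ my_key  # assumed XOR metric
        return key_func


if __name__ == '__main__':
    leaves = LeafSet(my_key=0b1000, capacity=2)
    leaves[0b1001] = 'near'     # distance 1
    leaves[0b0000] = 'far'      # distance 8
    leaves[0b1010] = 'nearer'   # distance 2 -> evicts the farthest entry
    print(leaves)               # only the 2 closest keys remain
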
import time

import numpy as np
import tensorflow as tf
from sortedcontainers import SortedDict


class RL():
    def __init__(self, sess, env, n_s, n_a, args):
        if sess is None:
            self.sess = tf.Session()
        else:
            self.sess = sess
        self.args = args
        self.env = env
        self.env.seed(self.args.seed)
        self.n_s = n_s
        self.n_a = n_a
        self.init = True  # True while warming up; random actions are used until the first training pass
        self.ite_count = 0  # number of training iterations
        self.dict = SortedDict()
        self.release = 10
        self.reward_ = 200  # used to clip the desired reward
        self.save_index = 0  # checkpoint save index
        self.network_model()
        self.saver = tf.compat.v1.train.Saver()
        tf.compat.v1.random.set_random_seed(args.seed)

    def network_model(self):
        def init_weights(input_):
            x = 1 / (np.sqrt(input_))
            return tf.compat.v1.random_uniform_initializer(-x, x, seed=self.args.seed)

        def behavior_build_network():
            w_init = tf.compat.v1.initializers.variance_scaling(
                scale=np.sqrt(2 / (1 + np.sqrt(5)**2)),
                distribution='uniform',
                seed=self.args.seed)
            l1 = tf.layers.dense(
                inputs=self.input_,
                units=self.args.hidden_units,
                kernel_initializer=w_init,
                bias_initializer=init_weights(self.n_s),
                activation='sigmoid',
                name='l1',
            )
            c1 = tf.layers.dense(
                inputs=tf.concat((self.d_r, self.d_h), 1),
                units=self.args.hidden_units,
                kernel_initializer=w_init,
                bias_initializer=init_weights(self.args.hidden_units),
                activation='sigmoid',
                name='c_out')
            # Gate the state embedding by the command embedding.
            out_1 = tf.math.multiply(l1, c1)
            l2 = tf.layers.dense(
                inputs=out_1,
                units=self.n_a,
                activation=None,
                kernel_initializer=w_init,
                bias_initializer=init_weights(self.args.hidden_units),
                name='l2',
            )
            # b = tf.layers.dense(
            #     inputs=l2,
            #     units=self.n_a,
            #     kernel_initializer=w_init,
            #     bias_initializer=init_weights(self.args.hidden_units),
            #     activation=None,
            #     name='out')
            b = l2
            return b

        # All network inputs.
        self.input_ = tf.compat.v1.placeholder(tf.float32, [None, self.n_s], 'input_')
        self.c_in = tf.compat.v1.placeholder(tf.float32, [None, 2], 'c_in')
        self.d_h = tf.compat.v1.placeholder(tf.float32, [None, 1], 'd_h')
        self.d_r = tf.compat.v1.placeholder(tf.float32, [None, 1], 'd_r')
        self.a = tf.compat.v1.placeholder(tf.int32, [None], 'action')
        with tf.compat.v1.variable_scope('behavior_function'):
            self.b = behavior_build_network()
            self.b_softmax = tf.nn.softmax(self.b)
            self.a_out = tf.squeeze(
                tf.random.categorical(logits=self.b, num_samples=1, seed=self.args.seed))
        with tf.compat.v1.variable_scope('loss'):
            self.loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.b, labels=self.a))
        with tf.compat.v1.variable_scope('train'):
            self.train_op = tf.compat.v1.train.AdamOptimizer(self.args.lr).minimize(self.loss)

    def action_choice(self, s, c, dr, dh):
        s = np.asarray(s, dtype=np.float32).reshape((1, self.n_s))
        dr = np.asarray(dr).reshape((-1, 1))
        dh = np.asarray(dh).reshape((-1, 1))
        action = self.sess.run(self.a_out, {
            self.input_: s,
            self.d_r: dr,
            self.d_h: dh
        })
        return action

    def get_(self):
        if self.init:
            self.desire_r_init, self.desire_h_init = 0, 0
            return
        h = []
        r = []
        for _ in range(self.args.generate_per_single_training):
            episode = self.dict.popitem()  # pops the highest-total-reward episode first
            h.append(len(episode[1][0]))
            r.append(episode[0])
        selected_episode_len = np.mean(h)
        selected_episode_mean = np.random.uniform(low=np.mean(r),
                                                  high=(np.mean(r) + np.std(r)))
        self.desire_r_init, self.desire_h_init = selected_episode_mean, selected_episode_len

    def feed(self):
        self.get_()
        self.dict.clear()
        for _ in range(self.args.memory_thersold):
            state, action, reward, total_reward = self.play()
            self.dict[total_reward] = (state, action, reward)
        self.init = False

    def play(self):
        s = self.env.reset()
        if self.ite_count == 0:
            self.sess.run(tf.compat.v1.global_variables_initializer())
        state_list = []
        action_list = []
        reward_list = []
        reward_total = 0
        done = False
        desire_h = self.desire_h_init
        desire_r = self.desire_r_init
        while not done:
            c = np.asarray([desire_h, desire_r])
            if self.init:
                a = np.random.randint(self.n_a)
            else:
                a = self.action_choice(s, c, desire_r, desire_h)
            s_, r, done, _ = self.env.step(a)
            state_list.append(s)
            action_list.append(a)
            reward_list.append(r)
            reward_total += r
            desire_h = max(desire_h - 1, 1)
            desire_r = min(desire_r - r, self.reward_)
            s = s_
            if done:
                break
        return state_list, action_list, reward_list, reward_total

    def learn(self):
        if self.ite_count == 0:
            self.sess.run(tf.compat.v1.global_variables_initializer())
        memory_dic = dict(self.dict)
        dic_value = list(memory_dic.values())
        for _ in range(self.args.n_update_eposide):
            state = []
            dr = []
            dh = []
            true_a = []
            c = []
            # Randomly sample which episodes will be used for this update.
            indices = np.random.choice(len(dic_value), self.args.batch_size, replace=True)
            tran = [dic_value[i] for i in indices]
            # Pick a random timestep in each sampled episode (excluding the last two steps).
            random_index = [np.random.choice(len(e[0]) - 2, 1) for e in tran]
            for idx_, tran_ in zip(random_index, tran):
                state.append(tran_[0][idx_[0]])
                dr.append(np.sum(tran_[2][idx_[0]:]))
                dh.append(len(tran_[0]) - idx_[0])
                true_a.append(tran_[1][idx_[0]])
                c.append([np.sum(tran_[2][idx_[0]:]), len(tran_[0]) - idx_[0]])
            command_ = np.asarray(c, dtype=np.float32).reshape(-1, 2)
            s_t = np.asarray(state, dtype=np.float32)
            action = np.asarray(true_a)
            dr = np.asarray(dr, dtype=np.float32).reshape((-1, 1))
            dh = np.asarray(dh, dtype=np.float32).reshape((-1, 1))
            _, loss = self.sess.run(
                [self.train_op, self.loss], {
                    self.input_: s_t,
                    self.c_in: command_,
                    self.a: action,
                    self.d_r: dr,
                    self.d_h: dh
                })

    def eval(self, eval_ite):
        test_reward = []
        test_step = []
        for i in range(self.args.eval_step):
            _, _, r_list, total_reward = self.play()
            test_reward.append(total_reward)
            test_step.append(len(r_list))
        print('ite: {}, reward: {:.3f},'.format(eval_ite, np.mean(test_reward)))
        return np.mean(test_reward)

    def train(self):
        self.feed()
        test = []
        print('----------------using tensorflow with {} generate_step_per_single_training----------------'
              .format(self.args.generate_per_single_training))
        start_ = time.time()
        while True:
            self.learn()
            self.ite_count += 1
            self.feed()
            if (self.ite_count - 1) % self.args.eval_step_every_k_step == 0:
                score = self.eval(self.ite_count - 1)
                test.append(score)
                if len(test) % self.release == 0 or (self.ite_count - 1) == 0:
                    # self.saver.save(self.sess, r'C:\Users\USER\Desktop\Upside down\new folder\result\memory_thersold\tensorflow_model_{}_tensorflow_categorical_1.ckpt'.format(self.args.generate_per_single_training))
                    self.saver.save(
                        self.sess,
                        self.args.save_path + '\\' +
                        'tensorflow_model_{}_tensorflow_categorical_1.ckpt'.format(
                            self.args.generate_per_single_training))
                    # np.save(r'C:\Users\USER\Desktop\Upside down\new folder\result\memory_thersold\tensorflow_reward_test_{}_{}__tensorflow_categorical_1.npy'.format(self.save_index, self.args.generate_per_single_training), test)
                    print('saved')
                    self.save_index += 1
                    test = []
            # Minutes elapsed for this iteration.
            print((time.time() - start_) / 60)
            start_ = time.time()
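
# --- Hypothetical entry point (not part of the original file) ---
# A minimal sketch of how RL might be driven, assuming TensorFlow 1.x, an
# OpenAI Gym environment with the classic 4-tuple step API, and an
# argparse-style namespace carrying the hyperparameters the class reads from
# `args`. The argument defaults and the LunarLander-v2 environment are guesses,
# not the original project's settings; argument spellings are kept as the
# class uses them.
if __name__ == '__main__':
    import argparse
    import os

    import gym

    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--hidden_units', type=int, default=64)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--generate_per_single_training', type=int, default=10)
    parser.add_argument('--memory_thersold', type=int, default=100)   # spelling kept from the class
    parser.add_argument('--n_update_eposide', type=int, default=100)  # spelling kept from the class
    parser.add_argument('--eval_step', type=int, default=10)
    parser.add_argument('--eval_step_every_k_step', type=int, default=10)
    parser.add_argument('--save_path', type=str, default='./checkpoints')
    args = parser.parse_args()
    os.makedirs(args.save_path, exist_ok=True)

    env = gym.make('LunarLander-v2')  # assumed environment
    agent = RL(sess=None, env=env,
               n_s=env.observation_space.shape[0],
               n_a=env.action_space.n,
               args=args)
    agent.train()  # runs indefinitely; interrupt to stop
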
import os

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from sortedcontainers import SortedDict

# `BehaviorFunc` is the project's behavior-function network (an nn.Module);
# a hypothetical stand-in is sketched after this class.


class UpsideDownRL(object):
    def __init__(self, env, args):
        super(UpsideDownRL, self).__init__()
        self.env = env
        self.args = args
        self.nb_actions = self.env.action_space.n
        self.state_space = self.env.observation_space.shape[0]
        # Use a sorted dict to store experiences gathered.
        # This helps in fetching the highest-reward trajectories during the exploratory stage.
        self.experience = SortedDict()
        self.B = BehaviorFunc(self.state_space, self.nb_actions, args).cuda()
        self.optimizer = optim.Adam(self.B.parameters(), lr=self.args.lr)
        self.use_random_actions = True  # True for the first training epoch.
        self.softmax = nn.Softmax(dim=-1)
        # Used to clip rewards so that B does not get unrealistic expected reward inputs.
        self.lunar_lander_max_reward = 250

    # Generate an episode using the given command inputs to the B function.
    def gen_episode(self, dr, dh):
        state = self.env.reset()
        states = []
        rewards = []
        actions = []
        total_reward = 0
        while True:
            action = self.select_action(state, dr, dh)
            next_state, reward, is_terminal, _ = self.env.step(action)
            if self.args.render:
                self.env.render()
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            total_reward += reward
            state = next_state
            dr = min(dr - reward, self.lunar_lander_max_reward)
            dh = max(dh - 1, 1)
            if is_terminal:
                break
        return total_reward, states, actions, rewards

    # Fetch the desired return and horizon from the best trajectories in the current replay buffer
    # to sample more trajectories using the latest behavior function.
    def fill_replay_buffer(self):
        dr, dh = self.get_desired_return_and_horizon()
        self.experience.clear()
        for i in range(self.args.replay_buffer_capacity):
            total_reward, states, actions, rewards = self.gen_episode(dr, dh)
            self.experience[total_reward] = (states, actions, rewards)
        if self.args.verbose:
            if self.use_random_actions:
                print("Filled replay buffer with random actions")
            else:
                print("Filled replay buffer using BehaviorFunc")
        self.use_random_actions = False

    def select_action(self, state, desired_return=None, desired_horizon=None):
        if self.use_random_actions:
            action = np.random.randint(self.nb_actions)
        else:
            action_prob = self.B(
                torch.from_numpy(state).cuda(),
                torch.from_numpy(np.array(desired_return, dtype=np.float32)).reshape(-1, 1).cuda(),
                torch.from_numpy(np.array(desired_horizon, dtype=np.float32).reshape(-1, 1)).cuda())
            action_prob = self.softmax(action_prob)
            # Create a categorical distribution over action probabilities.
            dist = Categorical(action_prob)
            action = dist.sample().item()
        return action

    # TODO: don't popitem from the experience buffer, since these best-performing
    # trajectories can have a huge impact on the learning of B.
    def get_desired_return_and_horizon(self):
        if self.use_random_actions:
            return 0, 0
        h = []
        r = []
        for i in range(self.args.explore_buffer_len):
            episode = self.experience.popitem()  # returned in sorted order (highest reward first)
            h.append(len(episode[1][0]))
            r.append(episode[0])
        mean_horizon_len = np.mean(h)
        mean_reward = np.random.uniform(low=np.mean(r), high=np.mean(r) + np.std(r))
        return mean_reward, mean_horizon_len

    def trainBehaviorFunc(self):
        experience_dict = dict(self.experience)
        experience_values = list(experience_dict.values())
        for i in range(self.args.train_iter):
            state = []
            dr = []
            dh = []
            target = []
            indices = np.random.choice(len(experience_values), self.args.batch_size, replace=True)
            train_episodes = [experience_values[i] for i in indices]
            t1 = [np.random.choice(len(e[0]) - 2, 1) for e in train_episodes]
            for pair in zip(t1, train_episodes):
                state.append(pair[1][0][pair[0][0]])
                dr.append(np.sum(pair[1][2][pair[0][0]:]))
                dh.append(len(pair[1][0]) - pair[0][0])
                target.append(pair[1][1][pair[0][0]])
            self.optimizer.zero_grad()
            state = torch.from_numpy(np.array(state)).cuda()
            dr = torch.from_numpy(np.array(dr, dtype=np.float32).reshape(-1, 1)).cuda()
            dh = torch.from_numpy(np.array(dh, dtype=np.float32).reshape(-1, 1)).cuda()
            target = torch.from_numpy(np.array(target)).long().cuda()
            action_logits = self.B(state, dr, dh)
            loss = nn.CrossEntropyLoss()
            output = loss(action_logits, target).mean()
            output.backward()
            self.optimizer.step()

    # Evaluate the agent using the initial command input from the best performing trajectories.
    def evaluate(self):
        testing_rewards = []
        testing_steps = []
        dr, dh = self.get_desired_return_and_horizon()
        for i in range(self.args.evaluate_trials):
            total_reward, states, actions, rewards = self.gen_episode(dr, dh)
            testing_rewards.append(total_reward)
            testing_steps.append(len(rewards))
        print("Mean reward achieved : {}".format(np.mean(testing_rewards)))
        return np.mean(testing_rewards)

    def train(self):
        # Fill the replay buffer with random actions for the first time.
        self.fill_replay_buffer()
        iterations = 0
        test_returns = []
        while True:
            # Train the behavior function with trajectories stored in the replay buffer.
            self.trainBehaviorFunc()
            self.fill_replay_buffer()
            if iterations % self.args.eval_every_k_epoch == 0:
                test_returns.append(self.evaluate())
                torch.save(self.B.state_dict(),
                           os.path.join(self.args.save_path, "model.pkl"))
                np.save(os.path.join(self.args.save_path, "testing_rewards"), test_returns)
            iterations += 1
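
# --- Hypothetical wiring (not part of the original file) ---
# The class above expects an external `BehaviorFunc(state_space, nb_actions, args)`
# module mapping (state, desired_return, desired_horizon) to action logits.
# Below is a minimal stand-in (command-gated MLP, mirroring the gating used by
# the TensorFlow class earlier in this file) plus an argparse entry point.
# Layer sizes, the optional `hidden_size` argument, the default hyperparameters,
# and the LunarLander-v2 environment are assumptions, not the original
# project's definitions. A CUDA-capable GPU is required because the class moves
# everything to .cuda(), and the classic 4-tuple Gym step API is assumed.
class BehaviorFunc(nn.Module):
    def __init__(self, state_space, nb_actions, args):
        super().__init__()
        hidden = getattr(args, 'hidden_size', 64)
        self.state_fc = nn.Linear(state_space, hidden)
        self.command_fc = nn.Linear(2, hidden)  # (desired_return, desired_horizon)
        self.out_fc = nn.Linear(hidden, nb_actions)

    def forward(self, state, dr, dh):
        # Gate the state embedding by the command embedding, then map to logits.
        s = torch.sigmoid(self.state_fc(state))
        c = torch.sigmoid(self.command_fc(torch.cat((dr, dh), dim=-1)))
        return self.out_fc(s * c)


if __name__ == '__main__':
    import argparse

    import gym

    parser = argparse.ArgumentParser()
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--train_iter', type=int, default=100)
    parser.add_argument('--replay_buffer_capacity', type=int, default=500)
    parser.add_argument('--explore_buffer_len', type=int, default=20)
    parser.add_argument('--evaluate_trials', type=int, default=10)
    parser.add_argument('--eval_every_k_epoch', type=int, default=5)
    parser.add_argument('--save_path', type=str, default='./checkpoints')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()
    os.makedirs(args.save_path, exist_ok=True)

    agent = UpsideDownRL(gym.make('LunarLander-v2'), args)
    agent.train()  # runs indefinitely; interrupt to stop
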
import logging
import os
import re
from math import floor, fmod

from sortedcontainers import SortedDict, SortedList

# Project-local helpers (not defined here): TwitterUser, urlmarker.WEB_URL_REGEX,
# flatten(), and the TooManyHyperLinks exception.


class Tweet():
    def __init__(self, text=None):
        if not text:
            print('Decide what you want to post before poking me 😡😡😡')
        elif self.isstatusurl(text):
            self.__dict__ = TwitterUser().api.get_status(
                text.split('/')[-1]).__dict__
        elif isinstance(text, str) and os.path.isfile(text):
            self.photo_file = text
        else:
            self._raw = text
            self._raw_chars = len(self._raw)
            self.wash()
            self.divide()
            self.pool()
            # Defaults so tailor() also works when the text contains no urls.
            self.raw_urls = []
            self.raw_urls_chars = 0
            self.shortens = []
            self.shortens_chars = 0
            urls = self.extractURL(self._raw)
            if urls:
                if len(urls) > 5:
                    raise TooManyHyperLinks
                self.raw_urls = urls
                logging.info('Counting urls and their characters in the raw text...')
                self.raw_urls_chars = sum(len(url) for url in self.raw_urls)
                logging.info('{} urls in the raw text, {} characters in total'.format(
                    len(urls), self.raw_urls_chars))
                logging.info('Generating short links...')
                self.shorten()
                logging.info('Short links done')
                self.shortens_chars = sum(len(url) for url in self.shortens)
            self.buildOffset()
            self.tailor()

    @classmethod
    def extractURL(cls, text):
        return re.findall(urlmarker.WEB_URL_REGEX, text)

    @classmethod
    def isurl(cls, text):
        # True only when the text is exactly one url and nothing else.
        urls = cls.extractURL(text)
        if len(urls) != 1 or len(set([text] + urls)) != 1:
            return False
        return True

    @classmethod
    def isstatusurl(cls, text):
        if cls.isurl(text):
            return 'twitter.com' in text and 'status' in text
        return False

    def wash(self):
        '''Collapse runs of whitespace to a single character.'''
        text = self._raw
        workspace = []
        for i in range(1, len(text)):
            if not text[i - 1].isspace():
                workspace.append(text[i - 1])
            elif not text[i].isspace():
                workspace.append(text[i - 1])
        if not text[-1].isspace():
            self.washed = ''.join(workspace) + text[-1]
        else:
            self.washed = ''.join(workspace)
        self.washed_chars = len(self.washed)

    def shorten(self):
        try:
            logging.debug('Switching to the dev account...')
            dev_user = TwitterUser()
            logging.debug('Dev account switched successfully')
            logging.debug('Creating intermediate tweet...')
            middle_tweet = dev_user.api.update_status(self.raw_urls)
            logging.debug('Fetching short links...')
            self.shortens = [
                url['url'] for url in middle_tweet.entities['urls']
            ]
            logging.debug('All good, deleting the intermediate tweet...')
            middle_tweet.destroy()
            logging.debug('Intermediate tweet deleted...')
        except Exception as e:
            print('something wrong with shorten(), ' + str(e))

    def divide(self):
        if fmod(self.washed_chars, 280) != 0:
            self.n_pages = floor(self.washed_chars / 280) + 1
        else:
            self.n_pages = floor(self.washed_chars / 280)

    def pool(self):
        # Newline-separated parts.
        self.nsp = [
            item for item in self.washed.split('\n') if item is not None
        ]
        # Note: this list attribute shadows the pool() method after the first call.
        self.pool = flatten(
            [item.split(' ') for item in self.nsp if item is not None])

    def buildOffset(self):
        self.offset = SortedDict({0: self.pool[0]})
        for i in range(1, len(self.pool)):
            self.offset[self.offset.keys()[i - 1] +
                        len(self.pool[i - 1]) + 1] = self.pool[i]

    def buildPages(self):
        dividers = [n * 280 for n in range(1, self.n_pages)]
        indices = SortedList(
            set(dividers + [key for key in self.offset.keys()]))
        pages = SortedDict()
        for i in range(0, self.n_pages):
            # first page
            if i == 0:
                page = indices[0:indices.index(dividers[0]) - 1]
            # last page
            elif i == self.n_pages - 1:
                page = indices[indices.index(dividers[i - 1]) - 1:]
            else:
                page = indices[indices.index(dividers[i - 1]):indices.index(dividers[i])]
            pages[i] = page
        # Remove dividers that do not correspond to real offsets.
        for key in pages:
            for item in list(pages[key]):
                if item not in self.offset:
                    pages[key].remove(item)
        self.pagination = SortedDict()
        for i in range(self.n_pages):
            temp_key_list = [key for key in pages[i]]
            page_pool = []
            for key in temp_key_list:
                page_pool.append(self.offset[key])
            self.pagination[i] = ' '.join(page_pool)

    def tailor(self):
        if self.washed_chars > 280:
            # No urls: just paginate.
            if not self.raw_urls:
                self.buildPages()
                self.tailored = self.pagination
            else:
                # Still over 280 characters even after shortening the urls.
                if (self.washed_chars - self.raw_urls_chars) + self.shortens_chars > 280:
                    self.buildPages()
                    self.tailored = self.pagination
                else:
                    self.tailored = self.washed
        # Simple case: fits in a single tweet.
        else:
            self.tailored = self.washed
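
# --- Usage sketch (hypothetical, not from the original project) ---
# Splitting a long post into tweet-sized pages. This assumes the project-local
# helpers (urlmarker.WEB_URL_REGEX and flatten) are importable in this module;
# TwitterUser and network access are only needed when the text contains urls,
# which the sample text below avoids.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    long_post = 'word ' * 100  # roughly 500 characters, forces pagination
    tweet = Tweet(long_post)
    if isinstance(tweet.tailored, SortedDict):
        for page_number, page_text in tweet.tailored.items():
            print(page_number, len(page_text), page_text[:40])
    else:
        print(tweet.tailored)
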