def collecting_training_data(self, epoch=1):
    # Roll out 64 trajectories with the training agent under an annealed temperature.
    self.data_set = []
    diversitys = []
    for _ in range(64):
        rewards = 0
        trajectory = []
        cur_state = self.env.reset()
        terminal = False
        while not terminal:
            action = self.training_agent.tompson_sampling(
                cur_state, trajectory,
                max([
                    self.config.temperature / (epoch + 1e-10),
                    self.config.mini_temperature
                ]))
            next_state, reward, terminal, div = self.env.step(action)
            diversitys.append(div)
            trajectory.append((cur_state, action, reward))
            cur_state = next_state
            rewards += reward
        self.data_set.append(trajectory)
    log.info(
        "finish collecting training data",
        "average click",
        np.mean([np.sum([i[2] for i in item]) for item in self.data_set]),
        "average depth",
        np.mean([len(item) for item in self.data_set]),
        "diversity", np.mean(diversitys))

def create_model(cls, config, variable_scope="target", trainable=True, graph_name="DEFAULT"): log.info("CREATE MODEL", config.model, "GRAPH", graph_name, "VARIABLE SCOPE", variable_scope) if not graph_name in cls.GRAPHS: log.info("Adding a new tensorflow graph:", graph_name) cls.GRAPHS[graph_name] = tf.Graph() with cls.GRAPHS[graph_name].as_default(): model = cls(config, variable_scope=variable_scope, trainable=trainable) if not graph_name in cls.SESS: cls.SESS[graph_name] = tf.Session(config=tf.ConfigProto( gpu_options=config.GPU_OPTION)) cls.SAVER[graph_name] = tf.train.Saver(max_to_keep=50) cls.SESS[graph_name].run(model.init) return { "graph": cls.GRAPHS[graph_name], "sess": cls.SESS[graph_name], "saver": cls.SAVER[graph_name], "model": model }
def evaluate(self, num):
    import tensorflow as tf
    # Restore the checkpoint saved for training step `num`.
    path = os.path.join(self.config.saved_model_path,
                        self.config.model_id + "_" + str(num))
    self.pv_agent['saver'].restore(self.pv_agent['sess'],
                                   tf.train.latest_checkpoint(path))
    r = []
    uids = np.random.choice(range(1, self.config.user_num),
                            (self.config.evaluate_num, ),
                            replace=False)
    for uid in uids:
        try:
            self.env.reset4evaluate(uid)
            terminal = False
            reward = 0
            while not terminal:
                _action_probs, next_state_value = self.pv_agent[
                    "model"].get_actions_probability_model(
                        self.pv_agent["sess"], self.env)
                act, probability = zip(*_action_probs)
                # Greedy evaluation: pick the highest-probability action.
                action = act[np.argmax(probability)]
                candidate, node_type, reward, terminal = self.env.step(action)
            r.append((reward, self.env.accuracy, self.env.diversity))
        except Exception:
            # Skip users whose evaluation episode fails.
            continue
    print("####" * 5)
    f1, a, d = zip(*r)
    log.info("evaluate", num, "average_reward", np.mean(f1), "accuracy",
             np.mean(a), "diversity", np.mean(d))

def collecting_training_data(self):
    # Reuse trajectories cached on disk; otherwise roll out the environment's
    # behaviour policy and persist the result.
    if os.path.isfile(self.config.save_path):
        with open(self.config.save_path, "r") as f:
            for line in f.readlines():
                # Trajectories were serialized with str(); eval restores them.
                self.data_set.append(eval(line.strip("\n")))
    else:
        pbar = ProgressBar()
        diversitys = []
        for _ in pbar(range(self.config.trajectory_number)):
            trajectory = []
            cur_state = self.env.reset()
            terminal = False
            while not terminal:
                action = self.env.sampling()
                next_state, reward, terminal, div = self.env.step(action)
                trajectory.append((cur_state, action, reward))
                diversitys.append(div)
                cur_state = next_state
            self.data_set.append(trajectory)
        log.info(
            "finish collecting training data",
            "average depth", np.mean([len(item) for item in self.data_set]),
            "average click",
            np.mean([np.sum([i[2] for i in item]) for item in self.data_set]),
            "diversity", np.mean(diversitys))
        with open(self.config.save_path, "w") as f:
            for item in self.data_set:
                f.write(str(item) + "\n")

def collecting_training_data(self, epoch):
    self.data_set = []
    # Anneal the sampling temperature toward mini_temperature as epochs grow.
    temperature = max([
        self.config.temperature / (epoch + 1e-10),
        self.config.mini_temperature
    ])
    log.info("temperature is ", epoch, temperature)
    for _ in range(64):
        rewards = 0
        trajectory = []
        cur_state = self.env.reset()
        terminal = False
        while not terminal:
            action = self.rec_agent.tompson_sampling(cur_state, trajectory,
                                                     temperature)
            next_state, reward, terminal = self.env.step(action)
            trajectory.append((cur_state, action, reward))
            cur_state = next_state
            rewards += reward
        self.data_set.append(trajectory)
    log.info(
        "fake environment", self.epoch, "clicks",
        np.mean([np.sum([i[2] for i in item]) for item in self.data_set]),
        "depth", np.mean([len(item) for item in self.data_set]))

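# The `tompson_sampling` calls above receive the annealed temperature as their
# third argument; the method's body is not shown in this section. Below is a
# minimal sketch of temperature-scaled sampling over per-action scores, under
# the assumption that the agent can produce such scores. The helper name and
# the `scores` input are illustrative assumptions, not the project's actual API.
import numpy as np

def temperature_sampling_sketch(scores, temperature):
    # Softmax over scores / temperature: high temperature explores, low exploits.
    scaled = np.asarray(scores, dtype=np.float64) / max(temperature, 1e-10)
    scaled -= scaled.max()  # subtract the max for numerical stability
    probs = np.exp(scaled)
    probs /= probs.sum()
    return np.random.choice(len(probs), p=probs)
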
def update_simulator(self, data_set=[]):
    # Importance weighting: cumulative ratio between the agent's action
    # probability and the behaviour probability under the environment.
    action_probability = self.get_action_probability(data_set) * 0.01
    index = 0
    for trajectory in data_set:
        probability = 1.0
        for i, sar in enumerate(trajectory):
            state, action, reward = sar
            probability *= action_probability[index] / self.env.get_probability(
                state, action)
            # Clip the importance weight to avoid exploding updates.
            pp_a = min([probability, self.config.maximum_weight])
            if reward > 0:
                if i == len(trajectory) - 1:
                    self.memory.put(
                        (state, action, trajectory[:i], reward, 1.0, pp_a), 0)
                else:
                    self.memory.put(
                        (state, action, trajectory[:i], reward, 0.0, pp_a), 1)
            else:
                if i == len(trajectory) - 1:
                    self.memory.put(
                        (state, action, trajectory[:i], reward, 1.0, pp_a), 2)
                else:
                    self.memory.put(
                        (state, action, trajectory[:i], reward, 0.0, pp_a), 3)
            index += 1
    batch = self.memory.sample_batch(self.config.batch_size)
    uid = [i[0] for i in batch]
    iid = [i[1] for i in batch]
    label = [i[3] for i in batch]
    ts = [i[2] for i in batch]
    terminate = [i[4] for i in batch]
    weight = [i[5] for i in batch]
    log.info("training ratio", np.mean(terminate), "reward", np.mean(label),
             "weight", np.mean(weight))
    traj, feedbacks, target_index = self.convert_item_seq2matrix(
        [[ii[1] for ii in item] for item in ts],
        [[ii[2] for ii in item] for item in ts])
    data = {
        "uid": uid,
        "iid": iid,
        "label": label,
        "trajectory": traj,
        "feedback": feedbacks,
        "target_index": target_index,
        "terminate": terminate,
        "weight": weight
    }
    loss = self.simulator["model"].optimize_model(self.simulator["sess"], data)
    p_c, p_t = self.simulator["model"].predict(self.simulator["sess"], data)
    self.env.set_click_terminate_threshold(np.mean(p_c), np.mean(p_t))
    log.info("loss for simulator", loss)
    return loss

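# `convert_item_seq2matrix` is called above but defined elsewhere. A minimal
# sketch of one plausible implementation, assuming it right-pads the
# variable-length item and feedback sequences with zeros and records, for each
# sequence, the position of its last real element (e.g. for gathering the
# matching RNN output). The padding value and the index layout are assumptions,
# not taken from this section.
def convert_item_seq2matrix_sketch(item_seqs, feedback_seqs, pad_value=0):
    max_len = max([len(s) for s in item_seqs] + [1])
    items = [list(s) + [pad_value] * (max_len - len(s)) for s in item_seqs]
    feedbacks = [list(s) + [pad_value] * (max_len - len(s)) for s in feedback_seqs]
    # One (last_step, batch_index) pair per sequence; empty sequences point at
    # the padded first position.
    target_index = [[max(len(s) - 1, 0), b] for b, s in enumerate(item_seqs)]
    return items, feedbacks, target_index
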
def run(self):
    for i in range(self.config.epoch):
        self.collecting_training_data(i)
        loss = self.training_agent.update_model(self.data_set)
        log.info("training epoch", i, 'loss', loss)
        if i % 200 == 0:
            click, length, diversity = self.evaluate()
            log.info("epoch", i, "average click", click, "depth", length,
                     "div", np.mean(diversity))

def train(config, env, task_index, gpu_index, lock=Lock()):
    # Two-bucket replay buffer: one bucket per transition type (item[4]).
    buffer = type_memory(2, config.memory_capacity)
    from function_approximation import rnn_model
    import utils
    np.random.seed(int(time.time()))
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_index)
    # lock.acquire()
    pv_agent = rnn_model.create_model(config, task_index=task_index)
    # utils.save_model(pv_agent["saver"],
    #                  utils.create_saved_path(0, config.saved_model_path, config.model_id),
    #                  pv_agent["sess"],
    #                  pv_agent["model"].global_step)
    # lock.release()
    mcts = MCTS(pv_agent, config.c_puct, config.n_playout,
                config.discount_factor)
    for e in range(1, config.epoch + 1):
        average_reward = []
        for i in range(config.update_frequency):
            print(task_index, "collecting data")
            data, reward = collecting_training_samples(
                config, mcts, env, config.temperature / (e + 0.000001))
            average_reward.append(reward)
            for item in data:
                if item[4] == 0:
                    buffer.put(item, 0)
                elif item[4] == 1:
                    buffer.put(item, 1)
            print(task_index, "finish collecting")
        log.info(str(e), "process", str(task_index),
                 "collecting trajectory reward", np.mean(average_reward))
        batch = buffer.sample_batch(config.batch_size)
        lock.acquire()
        try:
            p1, v_1, p2, v_2 = pv_agent["model"].optimize_model_batch(
                pv_agent["sess"], batch)
            log.info("\t".join([
                str(item) for item in [
                    e, "process", task_index, "policy_1", p1, "value_1", v_1,
                    "policy2", p2, "value_2", v_2
                ]
            ]))
            utils.save_model(
                pv_agent["saver"],
                utils.create_saved_path(e, config.saved_model_path,
                                        config.model_id),
                pv_agent["sess"], pv_agent["model"].global_step)
        except Exception:
            # Never die while holding the lock; skip this update on failure.
            pass
        lock.release()

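# `type_memory(num_types, capacity)` is the replay buffer used above (and by
# the simulator's 4-bucket memory) but is not defined in this section. A
# minimal sketch, assuming one bounded FIFO per bucket and a roughly even
# split of each sampled batch across the buckets; the even split and the class
# name are assumptions.
import random
from collections import deque

class TypeMemorySketch(object):
    def __init__(self, num_types, capacity):
        self.buckets = [deque(maxlen=capacity) for _ in range(num_types)]

    def put(self, item, bucket_id):
        self.buckets[bucket_id].append(item)

    def sample_batch(self, batch_size):
        per_bucket = max(1, batch_size // len(self.buckets))
        batch = []
        for bucket in self.buckets:
            if bucket:
                batch.extend(random.choices(list(bucket), k=per_bucket))
        return batch[:batch_size]
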
def run(self):
    for i in range(self.config.epoch):
        if (i % 200 == 0 and i >= self.config.evaluation_num
                and self.training_agent.evaluate_or_not()):
            click, length, div = self.evaluate()
            log.info("epoch", i, "average click", click, "depth", length,
                     "diversity", div)
        random.shuffle(self.data_set)
        batch = [
            self.data_set[item] for item in np.random.choice(
                len(self.data_set), (self.config.batch_size, ))
        ]
        loss = self.training_agent.update_model(batch)
        log.info("training epoch", i, 'loss', loss)

def update_model(self, data_set=[]):
    self.global_update_time += 1
    if self.global_update_time <= self.config.evaluation_num:
        # Warm-up phase: fit the simulator on logged data only.
        loss = self.update_simulator(data_set)
        self.update_env()
    else:
        # Afterwards, periodically refresh the recommendation agent and train
        # the simulator on both negative and positive samples.
        if self.global_update_time % 10 == 0:
            self.collecting_training_data(self.epoch)
            self.update_rec_agent()
        loss_neg = self.update_simulator_negative()
        log.info("simulator negative loss", loss_neg)
        loss = self.update_simulator(data_set)
        log.info("simulator positive loss", loss)
        self.update_env()
    return loss

def update_simulator(self, data_set=[]):
    # Bucket transitions by (clicked or not) x (terminal or not).
    for trajectory in data_set:
        for i, sar in enumerate(trajectory):
            state, action, reward = sar
            if reward > 0:
                if i == len(trajectory) - 1:
                    self.memory.put((state, action, trajectory[:i], reward, 1.0), 0)
                else:
                    self.memory.put((state, action, trajectory[:i], reward, 0.0), 1)
            else:
                if i == len(trajectory) - 1:
                    self.memory.put((state, action, trajectory[:i], reward, 1.0), 2)
                else:
                    self.memory.put((state, action, trajectory[:i], reward, 0.0), 3)
    batch = self.memory.sample_batch(self.config.batch_size)
    uid = [i[0] for i in batch]
    iid = [i[1] for i in batch]
    label = [i[3] for i in batch]
    ts = [i[2] for i in batch]
    terminate = [i[4] for i in batch]
    log.info("training ratio", np.mean(terminate), "reward", np.mean(label))
    traj, feedbacks, target_index = self.convert_item_seq2matrix(
        [[ii[1] for ii in item] for item in ts],
        [[ii[2] for ii in item] for item in ts])
    # No importance weighting in this variant: uniform weights.
    weight = [1.0] * len(label)
    data = {
        "uid": uid,
        "iid": iid,
        "label": label,
        "trajectory": traj,
        "feedback": feedbacks,
        "target_index": target_index,
        "terminate": terminate,
        "weight": weight
    }
    loss = self.simulator["model"].optimize_model(self.simulator["sess"], data)
    log.info("loss for simulator", loss)
    return loss

def create_model(cls, config, variable_scope = "target", trainable = True, graph_name="DEFAULT",task_index=0): jobs = config.jobs job = list(jobs.keys())[0] log.info("CREATE MODEL", config.model, "GRAPH", graph_name, "VARIABLE SCOPE", variable_scope,"jobs",jobs,"job",job,"task_index",task_index) cls.CLUSTER = tf.train.ClusterSpec(jobs) cls.SERVER = tf.train.Server(cls.CLUSTER, job_name=job, task_index=task_index,config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))) if not graph_name in cls.GRAPHS: log.info("Adding a new tensorflow graph:",graph_name) cls.GRAPHS[graph_name] = tf.Graph() with cls.GRAPHS[graph_name].as_default(): model = cls(config, variable_scope=variable_scope, trainable=trainable) if not graph_name in cls.SESS: cls.SESS[graph_name] = tf.Session(cls.SERVER.target) cls.SAVER[graph_name] = tf.train.Saver(max_to_keep=1000) cls.SESS[graph_name].run(model.init) return {"graph": cls.GRAPHS[graph_name], "sess": cls.SESS[graph_name], "saver": cls.SAVER[graph_name], "model": model,"cluster":cls.CLUSTER,"server":cls.SERVER}
def update_model(self, data_set=[]):
    # Skip the update when every trajectory has length one (nothing to learn from).
    if np.mean([len(item) for item in data_set]) == 1.0:
        return 0.0
    temp_data = []
    for trajectory in data_set:
        for i, sar in enumerate(trajectory):
            state, action, reward = sar
            if i == len(trajectory) - 1:
                # Terminal transition: -1 marks "no successor".
                temp_data.append((state, action, trajectory[:i], reward, -1))
            else:
                temp_data.append((state, action, trajectory[:i], reward,
                                  trajectory[:i + 1]))
    temp_data, next_max_q = self.get_next_q_value_tuple(temp_data)
    self.memory.extend(temp_data)
    # Keep the replay buffer bounded to the most recent transitions.
    if len(self.memory) >= self.config.buffer_size:
        self.memory = self.memory[-self.config.buffer_size:]
    batch = [
        self.memory[item] for item in np.random.choice(
            len(self.memory), (self.config.batch_size, ))
    ]
    uid = [i[0] for i in batch]
    iid = [i[1] for i in batch]
    label = [i[3] for i in batch]
    ts = [i[2] for i in batch]
    traj, feedbacks, target_index = self.convert_item_seq2matrix(
        [[ii[1] for ii in item] for item in ts],
        [[ii[2] for ii in item] for item in ts])
    data = {
        "uid": uid,
        "iid": iid,
        "label": label,
        "trajectory": traj,
        "feedback": feedbacks,
        "target_index": target_index
    }
    loss = self.agent["model"].optimize_model(self.agent["sess"], data)
    log.info("average max_next_q value", np.mean(next_max_q))
    return loss

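# `get_next_q_value_tuple` is defined elsewhere. A minimal sketch of the
# standard Q-learning target it presumably produces, assuming a callable
# `q_values(successor)` that returns the agent's action values for a successor
# trajectory and that the -1 marker from update_model denotes a terminal
# transition. The helper name, the `q_values` callable and the discount
# parameter are assumptions, not the project's actual API.
import numpy as np

def q_learning_targets_sketch(temp_data, q_values, discount=0.9):
    labeled, next_max_q = [], []
    for state, action, history, reward, nxt in temp_data:
        # Terminal transitions bootstrap with zero.
        max_q = 0.0 if nxt == -1 else float(np.max(q_values(nxt)))
        next_max_q.append(max_q)
        labeled.append((state, action, history, reward + discount * max_q, nxt))
    return labeled, next_max_q
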
def init_training(self):
    log.info("load environment")
    self.training_agent = self.config.training_agent(self.config)
    self.env = diversity_environments(self.config)
    self.data_set = []
    self.collecting_training_data()

def update_rec_agent(self):
    # One policy update per call on freshly collected trajectories.
    for i in range(1):
        self.collecting_training_data(self.epoch)
        loss = self.rec_agent.update_model(self.data_set)
        log.info("rec agent", i, 'loss', loss)
    self.epoch += 1