class ACCovWorker:
    def __init__(self, name, trainer, model_path, global_episodes, global_rewards):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_rewards = global_rewards
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_coverages = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.batcher = Batcher()
        self.summary_writer = tf.summary.FileWriter("logs/train_" + str(self.number))
        with open('vectors_cov.pkl', 'rb') as fh:
            self.embeddings, self.embed_lookup = pickle.load(fh)
        self.num_feats = len(self.embeddings[0])

        # Create the local copy of the network and the TensorFlow op to copy
        # global parameters to the local network.
        self.local_AC = AConv_Network(self.name, trainer, self.num_feats)
        self.update_local_ops = update_target_graph('global', self.name)
        self.avgFunctions = {}
        self.sleep_time = 0.028
        self.env = Enviroment()

    def train(self, global_AC, rollout, sess, gamma, bootstrap_value):
        rollout = np.array(rollout)
        self.batcher.pad(rollout[:, 0], self.num_feats)
        self.batcher.init_child()
        self.batcher.pad_child(rollout[:, 1])
        nodes_observations = rollout[:, 0]
        children_observations = rollout[:, 1]
        actions = rollout[:, 2]
        rewards = rollout[:, 3]
        values = rollout[:, 5]

        # Take the rewards and values from the rollout and use them to generate
        # the advantages and discounted returns. The advantage function uses
        # "Generalized Advantage Estimation".
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus, gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages, gamma)

        # Update the global network using gradients from the loss, and generate
        # network statistics to periodically save.
        rnn_state = self.local_AC.state_init
        feed_dict = {self.local_AC.target_v: discounted_rewards,
                     self.local_AC.nodes: np.vstack(nodes_observations),
                     self.local_AC.children: np.vstack(children_observations),
                     self.local_AC.actions: actions,
                     self.local_AC.advantages: advantages,
                     self.local_AC.state_in[0]: self.batch_rnn_state[0],
                     self.local_AC.state_in[1]: self.batch_rnn_state[1]}
        v_l, p_l, e_l, g_n, v_n, adv, apl_g, self.batch_rnn_state = sess.run(
            [self.local_AC.value_loss,
             self.local_AC.policy_loss,
             self.local_AC.entropy,
             self.local_AC.grad_norms,
             self.local_AC.var_norms,
             self.local_AC.adv_sum,
             self.local_AC.apply_grads,
             self.local_AC.state_out],
            feed_dict=feed_dict)
        return (v_l / len(rollout), p_l / len(rollout), e_l / len(rollout),
                g_n, v_n, adv / len(rollout))
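    # `discount(x, gamma)` used in train() above is assumed to be the standard
    # discounted-cumulative-sum helper common to A3C implementations, defined
    # elsewhere in this repo. A minimal sketch of that assumption:
    #
    #     import scipy.signal
    #     def discount(x, gamma):
    #         # y[t] = x[t] + gamma * y[t + 1], evaluated right to left
    #         return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]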
    def work(self, max_episode_length, gamma, global_AC, sess, coord, saver):
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        print("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                self.env.reset()
                self.batcher = Batcher()
                episode_buffer = []
                episode_values = []
                episode_reward = 0
                episode_coverage = 0
                episode_step_count = 0
                m = 0
                rnn_state = self.local_AC.state_init
                self.batch_rnn_state = rnn_state
                while m < len(self.env.listOfFiles):
                    m += 1
                    self.env.prepareNextFileConvWithCov(self.number)
                    self.env.currentNumOfTable = 0
                    while self.env.currentNumOfTable < 1:
                        totalRows = 0
                        for table in self.env.listOfTables:
                            totalRows += len(table)
                        self.env.startTable()
                        self.env.currentNumOfRow = 0
                        for currentRow in self.env.listOfTableVectors[:1]:
                            numOfTimes = 0
                            d = False
                            self.env.initializeArgumentValuesCov()
                            complexity = getNumOfReasonableNodes(currentRow)
                            # complexity = int(complexity ** (1 / 3) * (len(self.env.listOfTables[0]) ** (1 / 3)))
                            complexity = int(complexity ** (1 / 3))
                            if complexity == 0:
                                complexity = 1
                            total = len(self.env.arguments) * totalRows * complexity
                            while numOfTimes < total:
                                if d:
                                    break
                                # self.env.initializeArgumentValues()
                                batches = list(enumerate(batch_samples(
                                    gen_samples(currentRow, self.embeddings, self.embed_lookup), 1)))
                                iterator = iter(batches)
                                batch = next(iterator, None)
                                while batch is not None:
                                    # self.env.initializeArgumentValues()
                                    if isinstance(batch[0], int):
                                        num, batch = batch
                                    nodes, children = batch
                                    self.batcher.checkMaxDim(nodes)
                                    a_dist, v, rnn_state = sess.run(
                                        [self.local_AC.policy, self.local_AC.value, self.local_AC.state_out],
                                        feed_dict={self.local_AC.nodes: nodes,
                                                   self.local_AC.children: children,
                                                   self.local_AC.state_in[0]: rnn_state[0],
                                                   self.local_AC.state_in[1]: rnn_state[1]})
                                    # Sample an action index from the policy distribution.
                                    a = np.random.choice(a_dist[0], p=a_dist[0])
                                    a = np.argmax(a_dist == a)
                                    r, d, c, _ = self.env.step_cov(a, self.number)
                                    # nextBatch = next(iterator, None)
                                    total_steps += 1
                                    episode_step_count += 1
                                    batch = next(iterator, None)
                                    # batch = nextBatch
                                    episode_buffer.append([nodes, children, a, r, d, v[0, 0]])
                                    # Track the value estimates for the Perf/Value summary.
                                    episode_values.append(v[0, 0])
                                    if d or numOfTimes + 1 == total:
                                        # Since we don't know what the true final return is,
                                        # we "bootstrap" from our current value estimation.
                                        v1 = sess.run(
                                            self.local_AC.value,
                                            feed_dict={self.local_AC.nodes: nodes,
                                                       self.local_AC.children: children,
                                                       self.local_AC.state_in[0]: rnn_state[0],
                                                       self.local_AC.state_in[1]: rnn_state[1]})[0, 0]
                                        v_l, p_l, e_l, g_n, v_n, adv = self.train(
                                            global_AC, episode_buffer, sess, gamma, v1)
                                        episode_buffer = []
                                        sess.run(self.update_local_ops)
                                # if episode_step_count >= max_episode_length - 1 or d or nextBatch is None:
                                if numOfTimes + 1 == total or d:
                                    episode_reward += r
                                    episode_coverage += c
                                    if self.env.rootTreeAdtNode.name not in self.avgFunctions:
                                        self.avgFunctions[self.env.rootTreeAdtNode.name] = [c]
                                    else:
                                        self.avgFunctions[self.env.rootTreeAdtNode.name].append(c)
                                    break
                                numOfTimes += 1
                            print("Worker: " + str(self.number) +
                                  ", with number of times: " + str(numOfTimes) +
                                  ", for file: " + str(m))
                print("Worker: " + str(self.number) +
                      ", with number of episodes: " + str(episode_count))
                episode_count += 1
                self.episode_rewards.append(episode_reward)
                self.episode_coverages.append(episode_coverage)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                # Update the network using the experience buffer at the end of the episode.
                if len(episode_buffer) != 0:
                    v_l, p_l, e_l, g_n, v_n, adv = self.train(
                        global_AC, episode_buffer, sess, gamma, 0.0)

                # Periodically save model parameters and summary statistics.
                if episode_count % 2 == 0 and episode_count != 0:
                    if episode_count % 100 == 0 and self.name == 'worker_0':
                        saver.save(sess, self.model_path + '/model-' + str(episode_count) + '.cptk')
                        print("Saved Model")
                    mean_reward = np.mean(self.episode_rewards[-2:])
                    mean_length = np.mean(self.episode_lengths[-2:])
                    mean_value = np.mean(self.episode_mean_values[-2:])
                    mean_coverage = np.mean(self.episode_coverages[-2:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Coverage', simple_value=float(mean_coverage))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                    summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                    summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    summary.value.add(tag='Losses/Advantage', simple_value=float(adv))
                    summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                    summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    # for key in self.env.dict_of_max_r.keys():
                    #     summary.value.add(tag='Functions/Max coverage for function: ' + str(key),
                    #                       simple_value=float(self.env.dict_of_max_r[key]))
                    for key in self.avgFunctions.keys():
                        summary.value.add(tag='Avg Functions/Avg coverage for function: ' + str(key),
                                          simple_value=float(np.mean(self.avgFunctions[key][-2:])))
                    self.summary_writer.add_summary(summary, episode_count)
                    self.summary_writer.flush()
                sess.run(self.increment)
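# A minimal sketch of what `update_target_graph('global', self.name)` is assumed
# to build above: the common A3C pattern of ops that copy the global network's
# trainable variables into a worker's local copy. The function name and scope
# handling here are illustrative; the repo's own helper is what the workers use.
def update_target_graph_sketch(from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    # One assign op per variable pair; sess.run() on the returned list copies
    # every weight, which is what sess.run(self.update_local_ops) does above.
    return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]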
class ACCovContWorker(object):
    def __init__(self, name, globalAC, sess, global_rewards, global_episodes, model_path):
        self.number = str(name)
        self.summary_writer = tf.summary.FileWriter("logs/train_" + str(name))
        self.name = "worker_" + str(name)
        self.global_rewards = global_rewards
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.model_path = model_path
        with open('vectors_cov.pkl', 'rb') as fh:
            self.embeddings, self.embed_lookup = pickle.load(fh)
        self.num_feats = len(self.embeddings[0])
        self.AC = ACNet(self.name, sess, self.num_feats, globalAC)  # create ACNet for each worker
        self.sess = sess
        self.episode_rewards = []
        self.episode_coverages = []
        self.episode_lengths = []
        self.a_loss = []
        self.c_loss = []
        self.batcher = Batcher()
        self.env = Enviroment()
        self.avgFunctions = {}

    def getCovVector(self, c):
        if (self.env.argumentChangedVal % len(list(self.env.arguments.keys())) == 0
                and self.env.argumentChangedVal != 0):
            argColVal = (self.env.argumentColumnValue + 1) % len(self.env.listOfTables[0])
            keyOfArg = (self.env.argumentChangedVal + 1) % len(list(self.env.arguments.keys()))
        else:
            argColVal = self.env.argumentColumnValue % len(self.env.listOfTables[0])
            keyOfArg = self.env.argumentChangedVal % len(list(self.env.arguments.keys()))
        return [argColVal, keyOfArg, c]
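    # getCovVector packs the environment's current position into a small
    # conditioning vector for the continuous policy: the table column being
    # filled, the argument being changed (both wrapped by modulo), and the
    # coverage c observed so far. Illustrative values only: with 4 columns,
    # 3 arguments, and coverage 0.6 it could return [2, 1, 0.6].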
    def work(self, sess, coord, saver):
        episode_count = sess.run(self.global_episodes)
        total_step = 1
        buffer_s, buffer_a = [], []
        print("Starting " + str(self.name))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                self.env.reset()
                self.batcher = Batcher()
                episode_buffer = []
                buffer_r = []
                buffer_v_target = []
                episode_reward = 0
                episode_coverage = 0
                episode_step_count = 0
                m = 0
                rnn_state = self.AC.state_init
                self.batch_rnn_state = rnn_state
                while m < len(self.env.listOfFiles):
                    m += 1
                    self.env.prepareNextFileConvWithCov(self.number)
                    self.env.currentNumOfTable = 0
                    while self.env.currentNumOfTable < 1:
                        totalRows = 0
                        for table in self.env.listOfTables:
                            totalRows += len(table)
                        self.env.startTable()
                        self.env.currentNumOfRow = 0
                        for currentRow in self.env.listOfTableVectors[:1]:
                            numOfTimes = 0
                            d = False
                            self.env.initializeArgumentValuesCov()
                            complexity = getNumOfReasonableNodes(currentRow)
                            # complexity = int(complexity ** (1 / 3) * (len(self.env.listOfTables[0]) ** (1 / 3)))
                            complexity = int(complexity ** (1 / 1))
                            c = 0
                            if complexity == 0:
                                complexity = 1
                            total = len(self.env.arguments) * totalRows * complexity
                            while numOfTimes < total:
                                if d:
                                    break
                                # self.env.initializeArgumentValues()
                                batches = list(enumerate(batch_samples(
                                    gen_samples(currentRow, self.embeddings, self.embed_lookup), 1)))
                                iterator = iter(batches)
                                batch = next(iterator, None)
                                while batch is not None:
                                    # self.env.initializeArgumentValues()
                                    if isinstance(batch[0], int):
                                        num, batch = batch
                                    nodes, children = batch
                                    self.batcher.checkMaxDim(nodes)
                                    vectorMatrixWithCov = [self.getCovVector(c)]
                                    a, rnn_state = self.AC.choose_action(
                                        nodes, children, rnn_state, vectorMatrixWithCov)
                                    self.env.step_cov_continuos_without_reward(a, self.number)
                                    episode_buffer.append([nodes, children, a, vectorMatrixWithCov])
                                    # buffer_r.append(r)
                                    # The reward is only observed once an entire matrix of
                                    # argument values is complete; spread it over the steps
                                    # that produced that matrix.
                                    if (len(episode_buffer) % (len(self.env.arguments) * totalRows) == 0
                                            and len(episode_buffer) != 0):
                                        r, d, c, _ = self.env.step_cov_continuos_entire_matrix(self.number)
                                        temp = 0
                                        while temp < len(self.env.arguments) * totalRows:
                                            buffer_r.append(r)
                                            temp += 1
                                    # nextBatch = next(iterator, None)
                                    total_step += 1
                                    episode_step_count += 1
                                    batch = next(iterator, None)
                                    if d or numOfTimes + 1 == total:
                                        # Since we don't know what the true final return is,
                                        # we "bootstrap" from our current value estimation.
                                        if d:
                                            v_s_ = 0  # terminal
                                        else:
                                            v_s_ = self.sess.run(
                                                self.AC.v,
                                                {self.AC.nodes: nodes,
                                                 self.AC.children: children,
                                                 self.AC.matrixWithCov: vectorMatrixWithCov,
                                                 self.AC.state_in[0]: rnn_state[0],
                                                 self.AC.state_in[1]: rnn_state[1]})[0, 0]
                                        buffer_v_target = []
                                        rollout = np.array(episode_buffer)
                                        self.batcher.pad(rollout[:, 0], self.num_feats)
                                        self.batcher.init_child()
                                        self.batcher.pad_child(rollout[:, 1])
                                        for r in buffer_r[::-1]:  # reverse buffer r
                                            v_s_ = r + GAMMA * v_s_
                                            buffer_v_target.append(v_s_)
                                        buffer_v_target.reverse()
                                        buffer_s = np.vstack(rollout[:, 0])
                                        buffer_a = np.vstack(rollout[:, 2])
                                        buffer_c = np.vstack(rollout[:, 1])
                                        buffer_v_target = np.vstack(buffer_v_target)
                                        buffer_matrix_cov = np.vstack(rollout[:, 3])
                                        feed_dict = {
                                            self.AC.nodes: buffer_s,
                                            self.AC.children: buffer_c,
                                            self.AC.a_his: buffer_a,
                                            self.AC.v_target: buffer_v_target,
                                            self.AC.state_in[0]: self.batch_rnn_state[0],
                                            self.AC.state_in[1]: self.batch_rnn_state[1],
                                            self.AC.matrixWithCov: buffer_matrix_cov,
                                        }
                                        # Actual training step: update the global ACNet.
                                        _, _, self.batch_rnn_state, a_loss, c_loss = \
                                            self.AC.update_global(feed_dict)
                                        self.a_loss.append(a_loss)
                                        self.c_loss.append(c_loss)
                                        buffer_s, buffer_a, buffer_r, buffer_c, buffer_matrix_cov = \
                                            [], [], [], [], []
                                        episode_buffer = []
                                        self.AC.pull_global()  # get global parameters to local ACNet
                                # if episode_step_count >= max_episode_length - 1 or d or nextBatch is None:
                                if numOfTimes + 1 == total or d:
                                    episode_reward += r
                                    episode_coverage += c
                                    if self.env.rootTreeAdtNode.name not in self.avgFunctions:
                                        self.avgFunctions[self.env.rootTreeAdtNode.name] = [c]
                                    else:
                                        self.avgFunctions[self.env.rootTreeAdtNode.name].append(c)
                                    break
                                numOfTimes += 1
                            print("Worker: " + str(self.number) +
                                  ", with number of times: " + str(numOfTimes) +
                                  ", for file: " + str(m))
                episode_count += 1
                print("Worker: " + str(self.number) +
                      ", with number of episodes: " + str(episode_count))
                self.episode_rewards.append(episode_reward)
                self.episode_coverages.append(episode_coverage)
                self.episode_lengths.append(episode_step_count)

                if episode_count % 2 == 0 and episode_count != 0:
                    if episode_count % 100 == 0 and self.name == 'worker_0':
                        saver.save(sess, self.model_path + '/model-' + str(episode_count) + '.cptk')
                        print("Saved Model")
                    mean_reward = np.mean(self.episode_rewards[-2:])
                    mean_length = np.mean(self.episode_lengths[-2:])
                    mean_coverage = np.mean(self.episode_coverages[-2:])
                    mean_a_loss = np.mean(self.a_loss[-2:])
                    mean_c_loss = np.mean(self.c_loss[-2:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Coverage', simple_value=float(mean_coverage))
                    summary.value.add(tag='Loss/A_Loss', simple_value=float(mean_a_loss))
                    summary.value.add(tag='Loss/C_loss', simple_value=float(mean_c_loss))
                    for key in self.env.dict_of_max_r.keys():
                        summary.value.add(
                            tag='Max Functions/Max coverage for function: ' + str(key),
                            simple_value=float(self.env.dict_of_max_r[key]))
                    for key in self.avgFunctions.keys():
                        summary.value.add(
                            tag='Avg Functions/Avg coverage for function: ' + str(key),
                            simple_value=float(np.mean(self.avgFunctions[key][-2:])))
                    self.summary_writer.add_summary(summary, episode_count)
                    self.summary_writer.flush()
                sess.run(self.increment)
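# A standalone sketch of the bootstrapped-return computation performed inside
# ACCovContWorker.work above (the `for r in buffer_r[::-1]` loop): value
# targets are built backwards from the bootstrap value, so that
# v_target[t] = r[t] + GAMMA * v_target[t + 1]. Names are illustrative only.
def discounted_targets_sketch(rewards, bootstrap_value, gamma):
    v_s_ = bootstrap_value
    targets = []
    for r in reversed(rewards):
        v_s_ = r + gamma * v_s_
        targets.append(v_s_)
    targets.reverse()
    return targets

# Example: discounted_targets_sketch([1.0, 0.0, 1.0], 0.5, 0.9)
# returns [2.1745, 1.305, 1.45].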