Пример #1
0
def test(policy_net, validation_traces, logger, step, tb_logger):
	"""Validate the central agent's policy network and record the results.

	Deep copies of ``validation_traces`` are handed to the ``validate``
	helpers, so the caller's traces are never passed in directly.

	Args:
		policy_net: policy network forwarded to ``validate.val_loss`` /
			``validate.val_jmr``.
		validation_traces: traces to validate on.
		logger: logger used for the timing message and for error reporting.
		step: current training step; used as the x-axis of every scalar and
			written to the validation text log.
		tb_logger: object exposing ``add_scalar``, ``add_text`` and ``flush``.

	Returns:
		``(jct, makespan, reward)`` as returned by ``validate.val_jmr``, or
		``None`` if any exception was raised during validation (the error is
		logged and also sent to ``tb_logger`` as text).
	"""
	val_tic = time.time()
	tag_prefix = "Central "
	try:
		# The validation loss is only computed in supervised-learning mode.
		if pm.TRAINING_MODE == "SL":
			val_loss = validate.val_loss(policy_net, copy.deepcopy(validation_traces), logger, step)
			tb_logger.add_scalar(tag=tag_prefix + "Val Loss", value=val_loss, step=step)
		jct, makespan, reward = validate.val_jmr(policy_net, copy.deepcopy(validation_traces), logger, step, tb_logger)
		tb_logger.add_scalar(tag=tag_prefix + "Val JCT", value=jct, step=step)
		tb_logger.add_scalar(tag=tag_prefix + "Val Makespan", value=makespan, step=step)
		tb_logger.add_scalar(tag=tag_prefix + "Val Reward", value=reward, step=step)
		tb_logger.flush()
		val_toc = time.time()
		logger.info("Central Agent:" + " Validation at step " + str(step) + " Time: " + '%.3f' % (val_toc - val_tic))

		# log results
		if pm.TRAINING_MODE == "SL":
			log_name = "sl_validation.txt"
		else:
			log_name = "rl_validation.txt"
		# Use a context manager so the handle is closed even if write() fails;
		# the broad except below would otherwise leak the open file.
		with open(LOG_DIR + log_name, 'a') as f:
			f.write("step " + str(step) + ": " + str(jct) + " " + str(makespan) + " " + str(reward) + "\n")

		return (jct, makespan, reward)
	except Exception as e:
		# Best-effort: a failed validation must not take down the caller.
		logger.error("Error when validation! " + str(e))
		tb_logger.add_text(tag="validation error", value=str(e), step=step)
		return None
Пример #2
0
def rl_agent(net_weights_q, net_gradients_q, stats_q, id):
	"""Run one reinforcement-learning worker agent.

	The agent builds its own policy (and optionally value) network, pulls
	weights from ``net_weights_q``, rolls out episodes in an RL environment,
	stores samples in a replay memory, and, once the memory is full, computes
	gradients on sampled mini-batches and pushes them to ``net_gradients_q``.
	Per-step and per-trace statistics are pushed to ``stats_q``.

	Args:
		net_weights_q: queue-like object; ``get()`` yields the latest weights
			(a ``(policy, value)`` pair when ``pm.VALUE_NET`` is set) or the
			string ``"exit"`` to request shutdown.
		net_gradients_q: queue-like object; gradients are sent with ``put()``.
		stats_q: queue-like object; statistics tuples are sent with ``put()``.
		id: integer agent index. Used for the logger name, the numpy seed
			offset, GPU selection (``id % 2``) and the per-agent epsilon
			temperature.

	The function calls ``exit(0)`` when it receives ``"exit"`` and
	``exit(1)`` when NaN gradients are detected.
	"""
	logger = log.getLogger(name="agent_"+str(id), level=pm.LOG_MODE,mode="w",fh=True,ch=True,prefix="Agent " +str(id))
	logger.info("Start reinforcement learning, agent " + str(id) + " ...")

	if not pm.RANDOMNESS:
		np.random.seed(pm.np_seed+id+1)

	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True
	with tf.Session(config=config) as sess, tf.device("/gpu:"+str(id%2)):
		policy_net = network.PolicyNetwork(sess, "policy_net", pm.TRAINING_MODE, logger)
		if pm.VALUE_NET:
			value_net = network.ValueNetwork(sess, "value_net", pm.TRAINING_MODE, logger)
		sess.run(tf.global_variables_initializer())  # to avoid batch normalization error
		if pm.VALUE_NET:
			policy_weights, value_weights = net_weights_q.get()
			value_net.set_weights(value_weights)
		else:
			policy_weights = net_weights_q.get()
		policy_net.set_weights(policy_weights) # initialization from master
		first_time = True

		global_step = 1
		if not pm.VAL_ON_MASTER:
			validation_traces = []
			for i in range(pm.VAL_DATASET):
				validation_traces.append(trace.Trace(None).get_trace())
		if pm.PRIORITY_REPLAY:
			mem_store = prioritized_memory.Memory(maxlen=pm.REPLAY_MEMORY_SIZE)
		else:
			mem_store = memory.Memory(maxlen=pm.REPLAY_MEMORY_SIZE)
		logger.info("Filling experience buffer...")

		# generate training data
		traces = []
		for _ in range(pm.TRAIN_EPOCH_SIZE):
			traces.append(trace.Trace(None).get_trace())

		if pm.EPSILON_GREEDY:
			if pm.VARYING_EPSILON:
				temperature = pm.ANNEALING_TEMPERATURE * (1 + float(id)/pm.NUM_AGENTS)
			else:
				temperature = pm.ANNEALING_TEMPERATURE
		# One-shot gates: each trace regeneration stage below fires at most once.
		gates = [True, True, True]
		for epoch in range(pm.TOT_TRAIN_EPOCHS):
			for episode in range(pm.TRAIN_EPOCH_SIZE):
				if pm.CHANGING_JOB_TYPES:
					# NOTE: the regeneration loops must not reuse the name
					# `episode`; doing so clobbers the outer loop index and
					# makes this iteration train on (and report) the wrong trace.
					if global_step >= 0 and gates[0]:
						gates[0] = False
						traces = []
						for _ in range(pm.TRAIN_EPOCH_SIZE):
							traces.append(trace.Trace(None).get_trace(4))
						logger.info("Changing job types 4")
					elif global_step >= 1000 and gates[1]:
						gates[1] = False
						traces = []
						for _ in range(pm.TRAIN_EPOCH_SIZE):
							traces.append(trace.Trace(None).get_trace(6))
						logger.info("Changing job types 6")
					elif global_step >= 2000 and gates[2]:
						gates[2] = False
						traces = []
						for _ in range(pm.TRAIN_EPOCH_SIZE):
							traces.append(trace.Trace(None).get_trace(8))
						logger.info("Changing job types 8")
				tic = time.time()
				# The environment receives a deep copy of the trace.
				if mem_store.full() and pm.ENABLE_K8S:
					logger.info("Switching to k8s environment!!!")
					env = k8s_rl_env.K8S_RL_Env("RL", copy.deepcopy(traces[episode]), logger)
				else:
					env = rl_env.RL_Env("RL", copy.deepcopy(traces[episode]), logger)
				states = []
				masked_outputs = []
				actions = []
				rewards = []
				ts = 0
				while not env.end:
					if pm.LOG_MODE == "DEBUG":
						time.sleep(0.01)
					state = env.observe()
					output = policy_net.predict(np.reshape(state, (1, pm.STATE_DIM[0], pm.STATE_DIM[1])))
					if pm.EPSILON_GREEDY: # greedy epsilon
						env.epsilon = 2 / (1 + np.exp(global_step / temperature))
					masked_output, action, reward, move_on, valid_state = env.step(output)

					if valid_state: # do not save state when move on except skip_ts, but need to save reward!!!
						states.append(state)
						masked_outputs.append(masked_output)
						actions.append(action)
						rewards.append(reward)
					if move_on:
						ts += 1
						if ts%pm.LT_REWARD_NUM_TS == 0 and len(states) > 0: # states can be [] due to no jobs in the ts
							if pm.LT_REWARD_IN_TS:
								# discounted reward-to-go within the window
								for i in reversed(range(0,len(rewards)-1)):
									rewards[i] += rewards[i+1]*pm.DISCOUNT_FACTOR
							elif pm.TS_REWARD_PLUS_JOB_REWARD:
								rewards = env.get_job_reward()
								assert len(rewards) == len(states)
							else:
								# every sample in the window gets the latest reward
								rewards = [reward for _ in range(len(states))]

							# randomly fill samples to memory
							if pm.RANDOM_FILL_MEMORY:
								# Sampling without replacement raises if asked for more
								# items than exist, so cap the size at len(states).
								sample_size = min(pm.MINI_BATCH_SIZE, len(states))
								indexes = np.random.choice(len(states), size=sample_size, replace=False)
								for i in indexes:
									mem_store.store(states[i], masked_outputs[i], actions[i], rewards[i])
							else:
								for i in range(len(states)):
									mem_store.store(states[i], masked_outputs[i], actions[i], rewards[i])

							if mem_store.full() and ts%pm.NUM_TS_PER_UPDATE == 0:
								# prepare a training batch
								mem_indexes, trajectories, IS_weights = mem_store.sample(pm.MINI_BATCH_SIZE)
								states_batch = [traj.state for traj in trajectories]
								outputs_batch = [traj.output for traj in trajectories]
								actions_batch = [traj.action for traj in trajectories]
								rewards_batch = [traj.reward for traj in trajectories]

								# pull latest weights before training
								if not first_time: # avoid pulling twice at the first update
									weights_msg = net_weights_q.get()
									# A bare "exit" string cannot be tuple-unpacked, so only
									# unpack when the message is not a string. A tuple whose
									# first element is "exit" is still honoured below.
									if pm.VALUE_NET and not isinstance(weights_msg, basestring):
										policy_weights, value_weights = weights_msg
									else:
										policy_weights = weights_msg
									if isinstance(policy_weights, basestring) and policy_weights == "exit":
										logger.info("Agent " + str(id) + " exits.")
										exit(0)
									policy_net.set_weights(policy_weights)
									if pm.VALUE_NET:
										value_net.set_weights(value_weights)
								else:
									first_time = False

								# set entropy weight, both agent and central agent need to be set
								policy_net.anneal_entropy_weight(global_step)

								# reinforcement learning to calculate gradients
								if pm.VALUE_NET:
									value_output = value_net.predict(np.stack(states_batch))
									td_loss = np.vstack(rewards_batch) - value_output
									adjusted_td_loss = td_loss * np.vstack(IS_weights)
									policy_entropy, policy_loss, policy_grads = policy_net.get_rl_gradients(
										np.stack(states_batch), np.vstack(outputs_batch), np.vstack(actions_batch), adjusted_td_loss)
									value_loss, value_grads = value_net.get_rl_gradients(np.stack(states_batch), value_output, np.vstack(rewards_batch))
								else:
									if pm.PRIORITY_MEMORY_SORT_REWARD and pm.MEAN_REWARD_BASELINE:
										td_loss = np.vstack(rewards_batch) - mem_store.avg_reward()
									else:
										td_loss = np.vstack(rewards_batch) - 0
									adjusted_td_loss = td_loss * np.vstack(IS_weights)
									policy_entropy, policy_loss, policy_grads = policy_net.get_rl_gradients(np.stack(states_batch), np.vstack(outputs_batch), np.vstack(actions_batch), adjusted_td_loss)

								# debug-log samples whose action vector has its last entry set
								for aa in range(len(actions_batch)):
									if actions_batch[aa][-1] == 1:
										logger.debug("rewards:" + str(rewards_batch[aa]) + "td_loss:" + str(td_loss[aa]))

								# Abort on NaN gradients. Explicit checks (rather than
								# assert) so they still run under `python -O`.
								for i in range(len(policy_grads)):
									if np.any(np.isnan(policy_grads[i])):
										logger.error("Error: NaN in policy gradients, index " + str(i))
										logger.error("Gradients: " + str(policy_grads[i]))
										# states_batch is a list; stack it before slicing
										logger.error("Input type: " + str(np.stack(states_batch)[:,0]))
										logger.error("Masked Output: " + str(outputs_batch))
										logger.error("Action: " + str(actions_batch))
										logger.error("TD Loss: " + str(td_loss))
										logger.error("Policy Loss: " + str(policy_loss))
										logger.error("Policy Entropy: " + str(policy_entropy))
										exit(1) # another option is to continue
								if pm.VALUE_NET:
									for i in range(len(value_grads)):
										if np.any(np.isnan(value_grads[i])):
											logger.error("Error: NaN in value gradients, index " + str(i) + " " + str(value_grads[i]))
											exit(1)

								# send gradients to the central agent
								if pm.VALUE_NET:
									net_gradients_q.put((policy_grads, value_grads))
								else:
									net_gradients_q.put(policy_grads)
								if pm.PRIORITY_REPLAY:
									mem_store.update(mem_indexes, abs(td_loss))
								# validation
								# NOTE(review): traces are passed without a deep copy here;
								# confirm the validate helpers do not mutate them.
								if not pm.VAL_ON_MASTER and global_step % pm.VAL_INTERVAL == 0:
									val_loss = validate.val_loss(policy_net, validation_traces, logger, global_step)
									jct, makespan, reward = validate.val_jmr(policy_net, validation_traces, logger,
																			 global_step)
									stats_q.put(("val", val_loss, jct, makespan, reward))

								# statistics
								if pm.VALUE_NET:
									stats_q.put(("step:policy+value", policy_entropy, policy_loss, value_loss, sum(td_loss)/len(td_loss), sum(rewards_batch)/len(rewards_batch), output))
								else:
									stats_q.put(("step:policy", policy_entropy, policy_loss, sum(td_loss)/len(td_loss), sum(rewards_batch)/len(rewards_batch), output))
								global_step += 1

							# clear
							states = []
							masked_outputs = []
							actions = []
							rewards = []

				# collect statistics after training one trace
				num_jobs, jct, makespan, reward = env.get_results()
				stats_q.put(("trace:sched_result", jct, makespan, reward))
				if (epoch*pm.TRAIN_EPOCH_SIZE+episode)%pm.DISP_INTERVAL == 0:
					if (epoch*pm.TRAIN_EPOCH_SIZE+episode)%50 == 0:
						stats_q.put(("trace:job_stats", episode, env.get_jobstats()))
					toc = time.time()
					logger.info("--------------------------------------------------------------")
					logger.info("Agent " + str(id) + " Epoch " + str(epoch) + " Trace " + str(episode) + " Step " + str(global_step))
					logger.info("# of Jobs\t AVG JCT\t Makespan\t Reward\t Time")
					logger.info(str(num_jobs) + " \t" + " \t" + " " + '%.3f' %jct + " \t\t" + " " + '%.3f' %makespan \
								+ "\t\t" + " " + '%.3f' %reward + "\t" + " " + '%.3f' % (toc - tic))
Пример #3
0
def sl_agent(net_weights_q, net_gradients_q, stats_q, id):
	"""Run one supervised-learning worker agent.

	The agent replays job traces through the heuristic scheduler selected by
	``pm.HEURISTIC``, stores the resulting (input, label) pairs in a replay
	memory and, once the memory is full, computes supervised gradients on
	sampled mini-batches and pushes them to ``net_gradients_q``.

	Args:
		net_weights_q: queue-like object; ``get()`` yields the latest policy
			weights or the string ``"exit"`` to request shutdown.
		net_gradients_q: queue-like object; gradients are sent with ``put()``.
		stats_q: queue-like object; statistics tuples are sent with ``put()``.
		id: integer agent index. Used for the logger name, the numpy seed
			offset and GPU selection (``id % 2``).

	Raises:
		ValueError: if ``pm.HEURISTIC`` is not one of "DRF", "FIFO", "SRTF"
			or "Tetris".

	The function calls ``exit(0)`` when it receives ``"exit"`` and
	``exit(1)`` when NaN gradients are detected.
	"""
	logger = log.getLogger(name="agent_"+str(id), level=pm.LOG_MODE)
	logger.info("Start supervised learning, agent " + str(id) + " ...")

	if not pm.RANDOMNESS:
		np.random.seed(pm.np_seed+id+1)

	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True
	with tf.Session(config=config) as sess, tf.device("/gpu:"+str(id%2)):
		policy_net = network.PolicyNetwork(sess, "policy_net", pm.TRAINING_MODE, logger)
		sess.run(tf.global_variables_initializer())  # to avoid batch normalization error

		global_step = 1
		# running histories of per-trace results, averaged when reporting
		avg_jct = []
		avg_makespan = []
		avg_reward = []
		if not pm.VAL_ON_MASTER:
			validation_traces = []  # validation traces
			for i in range(pm.VAL_DATASET):
				validation_traces.append(trace.Trace(None).get_trace())
		# generate training traces
		traces = []
		for episode in range(pm.TRAIN_EPOCH_SIZE):
			job_trace = trace.Trace(None).get_trace()
			traces.append(job_trace)
		mem_store = memory.Memory(maxlen=pm.REPLAY_MEMORY_SIZE)
		logger.info("Filling experience buffer...")
		for epoch in range(pm.TOT_TRAIN_EPOCHS):
			for episode in range(pm.TRAIN_EPOCH_SIZE):
				tic = time.time()
				job_trace = copy.deepcopy(traces[episode])
				if pm.HEURISTIC == "DRF":
					env = drf_env.DRF_Env("DRF", job_trace, logger)
				elif pm.HEURISTIC == "FIFO":
					env = fifo_env.FIFO_Env("FIFO", job_trace, logger)
				elif pm.HEURISTIC == "SRTF":
					env = srtf_env.SRTF_Env("SRTF", job_trace, logger)
				elif pm.HEURISTIC == "Tetris":
					env = tetris_env.Tetris_Env("Tetris", job_trace, logger)
				else:
					# Without this, `env` would be undefined on the first episode
					# (NameError) or silently reuse a finished environment later.
					raise ValueError("Unknown heuristic: " + str(pm.HEURISTIC))

				while not env.end:
					if pm.LOG_MODE == "DEBUG":
						time.sleep(0.01)
					data = env.step()
					logger.debug("ts length:" + str(len(data)))

					# output and reward slots are unused in SL mode, hence the zeros
					for (sample_input, label) in data:
						mem_store.store(sample_input, 0, label, 0)

					if mem_store.full():
						# prepare a training batch
						_, trajectories, _ = mem_store.sample(pm.MINI_BATCH_SIZE)
						input_batch = [traj.state for traj in trajectories]
						label_batch = [traj.action for traj in trajectories]

						# pull latest weights before training
						weights = net_weights_q.get()
						if isinstance(weights, basestring) and weights == "exit":
							logger.info("Agent " + str(id) + " exits.")
							exit(0)
						policy_net.set_weights(weights)

						# supervised learning to calculate gradients
						entropy, loss, policy_grads = policy_net.get_sl_gradients(np.stack(input_batch),np.vstack(label_batch))
						# Abort on NaN gradients. Explicit check (rather than
						# assert) so it still runs under `python -O`.
						for i in range(len(policy_grads)):
							if np.any(np.isnan(policy_grads[i])):
								logger.error("Error: NaN in policy gradients, index " + str(i))
								logger.error("Gradients: " + str(policy_grads[i]))
								exit(1)

						# send gradients to the central agent
						net_gradients_q.put(policy_grads)

						# validation
						if not pm.VAL_ON_MASTER and global_step % pm.VAL_INTERVAL == 0:
							val_tic = time.time()
							val_loss = validate.val_loss(policy_net, validation_traces, logger, global_step)
							jct, makespan, reward = validate.val_jmr(policy_net, validation_traces, logger, global_step)
							stats_q.put(("val", val_loss, jct, makespan, reward))
							val_toc = time.time()
							logger.info("Agent " + str(id) + " Validation at step " + str(global_step) + " Time: " + '%.3f'%(val_toc-val_tic))
						stats_q.put(("step:sl", entropy, loss))

						global_step += 1

				num_jobs, jct, makespan, reward = env.get_results()
				avg_jct.append(jct)
				avg_makespan.append(makespan)
				avg_reward.append(reward)
				if global_step%pm.DISP_INTERVAL == 0:
					# 1.0* forces float division on Python 2 for all three averages
					logger.info("Agent\t AVG JCT\t Makespan\t Reward")
					logger.info(str(id) + " \t \t " + '%.3f' %(1.0*sum(avg_jct)/len(avg_jct)) + " \t\t" + " " + '%.3f' %(1.0*sum(avg_makespan)/len(avg_makespan)) \
								+ " \t" + " " + '%.3f' %(1.0*sum(avg_reward)/len(avg_reward)))