def fit(self, dataset, num_epochs=1):
    """Regress Q(s,a) targets: shuffle the dataset, split it into batches,
    and run num_epochs passes of SGD with a decaying learning rate."""
    # preprocessBatching is expected to pad the arrays so that the first
    # dimension is divisible by batch_size and the reshapes below are valid
    qstates = self.preprocessBatching(dataset["qstates"])
    qvalues = self.preprocessBatching(dataset["qvalues"])
    idx = list(range(qstates.shape[0]))
    random.shuffle(idx)
    num_batches = len(idx) // self.batch_size
    X = nd.array(qstates[idx], ctx=self.ctx)\
        .reshape([num_batches, self.batch_size] + list(qstates.shape[1:]))
    Y = nd.array(qvalues[idx], ctx=self.ctx)\
        .reshape([num_batches, self.batch_size] + list(qvalues.shape[1:]))
    alpha = self.alpha
    progressBar = ProgressBar(maxval=num_epochs)
    for e in range(num_epochs):
        if self.progbar:
            progressBar.printProgress(e, prefix="Training Q(s,a)",
                                      suffix="%s / %s" % (e + 1, num_epochs))
        if ((e + 1) % 100) == 0:  # halve the learning rate every 100 epochs
            alpha = alpha / 2.0
        for i in range(num_batches):
            with autograd.record():
                outputs, hidden_states = self.nn(X[i])
                loss = self.mean_loss(outputs, Y[i])
            loss.backward()
            self.optimizer(self.params, alpha)
            # keep an exponential moving average of the batch losses
            if (i == 0) and (e == 0):
                self.moving_loss = np.mean(loss.asnumpy()[0])
            else:
                self.moving_loss = 0.99 * self.moving_loss + \
                    0.01 * np.mean(loss.asnumpy()[0])
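For reference, a minimal usage sketch of fit(), assuming an already-constructed network instance (named model here purely for illustration) and made-up array shapes; in practice the input width is stateSpace.n + actionSpace.n and the targets are one Q-value per row, exactly as main() builds below with {"qstates": QS0, "qvalues": Q0_}:

import numpy as np

# hypothetical shapes: 256 transitions, 40 state+action features, 1 target Q-value each
dataset = {
    "qstates": np.random.randn(256, 40).astype(np.float32),
    "qvalues": np.random.randn(256, 1).astype(np.float32),
}
model.fit(dataset, num_epochs=10)   # `model` stands in for a DQNNetwork instance
print("error:", model.score())      # score() is assumed to report the moving loss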
def main():
    # define arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--render", action="store_true",
                        help="Render the state")
    parser.add_argument("--render_interval", type=int, default=10,
                        help="Number of rollouts to skip before rendering")
    parser.add_argument("--num_rollouts", type=int, default=-1,
                        help="Number of max rollouts")
    parser.add_argument("--logfile", type=str,
                        help="Indicate where to save rollout data")
    parser.add_argument("--load_params", type=str,
                        help="Load previously learned parameters from [LOAD_PARAMS]")
    parser.add_argument("--save_params", type=str,
                        help="Save learned parameters to [SAVE_PARAMS]")
    parser.add_argument("--silent", action="store_true",
                        help="Suppress print of the DQN config")
    parser.add_argument("--gamma", type=float, default=0.99,
                        help="Discount factor")
    parser.add_argument("--epsilon", type=float, default=0.1,
                        help="Random factor (for epsilon-greedy)")
    parser.add_argument("--eps_anneal", type=int, default=0,
                        help="Number of episodes over which to anneal epsilon")
    parser.add_argument("--sample_size", type=int, default=256,
                        help="Number of samples from the dataset per episode")
    parser.add_argument("--num_epochs", type=int, default=50,
                        help="Number of epochs to run per episode")
    parser.add_argument("--episode_length", type=int, default=128,
                        help="Number of rollouts per episode")
    parser.add_argument("--noise", type=float,
                        help="Amount of noise to add to the actions")
    parser.add_argument("--test", action="store_true",
                        help="Test the params")
    args = parser.parse_args()

    signal.signal(signal.SIGINT, stopsigCallback)
    global stopsig

    # create the basketball environment
    env = BasketballVelocityEnv(fps=60.0, timeInterval=1.0, goal=[0, 5, 0],
                                initialLengths=np.array([0, 0, 1, 1, 1, 0, 1]),
                                initialAngles=np.array([0, 45, -20, -20, 0, -20, 0]))

    # create the state and action spaces
    stateSpace = ContinuousSpace(ranges=env.state_range())
    actionSpace = DiscreteSpace(intervals=[25 for i in range(7)] + [1],
                                ranges=env.action_range())
    processor = DQNProcessor(actionSpace)

    # create the model and policy functions
    modelFn = DQNNetwork(sizes=[stateSpace.n + actionSpace.n, 128, 256, 256, 128, 1],
                         alpha=0.001, use_gpu=True, momentum=0.9)
    if args.load_params:
        print("Loading params...")
        modelFn.load_params(args.load_params)

    allActions = actionSpace.sampleAll()
    policyFn = EpsilonGreedyPolicy(epsilon=args.epsilon if not args.test else 0,
                                   getActionsFn=lambda state: allActions,
                                   distributionFn=lambda qstate: modelFn(qstate),
                                   processor=processor)
    replayBuffer = RingBuffer(max_limit=2048)
    if args.logfile:
        log = open(args.logfile, "a")

    if not args.silent:
        print("Env state range:", env.state_range())
        print("Env action range:", env.action_range())
        print("State space:", stateSpace.n)
        print("Action space:", actionSpace.n)
        print("Action space bins:", actionSpace.bins)
        print("Epsilon:", args.epsilon)
        print("Epsilon anneal episodes:", args.eps_anneal)
        print("Gamma:", args.gamma)
        __actionShape = policyFn.getActions(None).shape
        totalActions = np.prod(actionSpace.bins)
        print("Actions are sampled:", __actionShape[0] != totalActions)
        print("Number of actions:", totalActions)

    rollout = 0
    if not args.silent and not args.test:
        iterationBar = ProgressBar(maxval=args.episode_length)
    while args.num_rollouts == -1 or rollout < args.num_rollouts:
        if stopsig:
            break
        if not args.silent and not args.test:
            iterationBar.printProgress(rollout % args.episode_length,
                                       prefix="Query(s,a,s',r)",
                                       suffix="epsilon: " + str(policyFn.epsilon))

        # roll out one trajectory of at most 5 steps and store the transitions
        state = env.reset()
        reward = 0
        done = False
        steps = 0
        while not done and steps < 5:  # 5 step max
            action = policyFn(state)
            if steps == 4:  # on the last step, force the throw immediately
                action[-2] = 0
                action[-1] = 1
            envAction = processor.process_env_action(action)
            if args.noise:
                envAction[:7] += np.random.normal(scale=np.ones([7]) * args.noise)
            nextState, reward, done, info = env.step(envAction)
            replayBuffer.append([state, action, nextState, reward, done])
            if args.test and done:
                print("Reward:", reward)
            state = nextState
            steps += 1
        if args.render and (rollout + 1) % args.render_interval == 0:
            env.render()

        rollout += 1
        if args.eps_anneal > 0:  # linearly anneal epsilon down to 0.1
            epsilon_diff = args.epsilon - min(0.1, args.epsilon)
            policyFn.epsilon = args.epsilon - min(rollout, args.eps_anneal) / \
                float(args.eps_anneal) * epsilon_diff

        # after every episode_length rollouts, fit Q(s,a) on a replay sample
        if rollout % args.episode_length == 0 and not args.test:
            dataset = replayBuffer.sample(args.sample_size)
            states = np.array([d[0] for d in dataset])
            actions = np.array([d[1] for d in dataset])
            nextStates = [d[2] for d in dataset]
            rewards = np.array([[d[3]] for d in dataset])  # rewards need an extra []
            terminal = [d[4] for d in dataset]
            QS0 = processor.process_Qstate(states, actions)
            Q1 = np.zeros(rewards.shape, dtype=np.float32)
            if not args.silent:
                progressBar = ProgressBar(maxval=len(nextStates))
            for i, nextState in enumerate(nextStates):
                if stopsig:
                    break
                if not args.silent:
                    progressBar.printProgress(i, prefix="Creating Q(s,a)",
                                              suffix="%s / %s" % (i + 1, len(nextStates)))
                if terminal[i]:
                    continue  # Q1[i] stays 0 for terminal transitions
                dist = modelFn(processor.process_Qstate(
                    repmat(nextState, allActions.shape[0], 1), allActions))
                Q1[i, 0] = np.max(dist)  # max over a' in A of Q(s', a')
            if stopsig:
                break
            Q0_ = rewards + args.gamma * Q1
            modelFn.fit({"qstates": QS0, "qvalues": Q0_}, num_epochs=args.num_epochs)

            avgQ = np.sum(Q0_) / Q0_.shape[0]
            avgR = np.sum(rewards) / rewards.shape[0]
            print("Rollouts:", rollout,
                  "Error:", modelFn.score(),
                  "Average Q:", avgQ,
                  "Average R:", avgR)
            print("")
            if args.logfile:
                log.write("[" + str(rollout) + ", " + str(modelFn.score()) + ", " +
                          str(avgQ) + ", " + str(avgR) + "]\n")

    if args.logfile:
        log.close()
    if args.save_params:
        print("Saving params...")
        modelFn.save_params(args.save_params)
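The per-episode update above builds standard one-step Q-learning targets: each sampled transition (s, a, s', r) gets the target r + gamma * max over a' of Q(s', a'), and terminal transitions keep just r because their Q1 entry is left at zero. Below is a small self-contained sketch of that target computation, with made-up numbers standing in for the network's predictions:

import numpy as np

gamma = 0.99
rewards  = np.array([[0.0], [1.0], [0.0]])   # r for three sampled transitions
terminal = [False, False, True]              # the last transition ended the rollout
maxQ1    = np.array([[2.0], [0.5], [7.0]])   # stand-in for max over a' of Q(s', a')

Q1 = np.zeros(rewards.shape, dtype=np.float32)
for i in range(len(terminal)):
    if not terminal[i]:
        Q1[i, 0] = maxQ1[i, 0]               # terminal rows stay 0, as in main()

targets = rewards + gamma * Q1               # -> [[1.98], [1.495], [0.0]]
print(targets)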