# Imports required by this excerpt.
from itertools import count

import numpy as np
import torch
from torch.autograd import Variable

# Optimizer for the CNN classifier trained on observations gathered while touching.
classifier_optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)

running_reward = 0
batch = []
labels = []
total_steps = 0

if args.mode == "train" or args.mode == "all":
    for i_episode in count(1000):
        observation = env.reset()
        print("episode: ", i_episode)
        for t in range(1000):
            # Select an action (exploration controlled by args.epsilon).
            action = select_action(observation, env.action_space_n(), args.epsilon)
            observation, reward, done, info = env.step(action)
            model.rewards.append(reward)
            if env.is_touching():
                print("touching!")

            # Once enough observations have accumulated, run one classifier update.
            # print("batch size", len(batch))
            if len(batch) > args.batch_size:
                # The CNN expects float inputs; class labels must be a LongTensor
                # for the classification loss.
                batch = torch.from_numpy(np.asarray(batch)).float()
                labels = torch.from_numpy(np.asarray(labels)).long()
                if args.gpu and torch.cuda.is_available():
                    batch = batch.cuda()
                    labels = labels.cuda()
                batch = Variable(batch)
                labels = Variable(labels)
                classifier_optimizer.zero_grad()
                outputs = cnn(batch)
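                # --- Sketch (not in the original excerpt): a typical way to finish
                # this classifier update, assuming a cross-entropy objective and
                # that `batch`/`labels` should be cleared before the next round of
                # touch observations accumulates. ---
                loss = torch.nn.functional.cross_entropy(outputs, labels)
                loss.backward()
                classifier_optimizer.step()
                # Reset the buffers so a fresh batch can accumulate.
                batch = []
                labels = []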
done = False
while not done:
    action = RL.choose_action(observation)
    observation_, reward, done, info = env.step(action)

    # Something to consider: should we modify the reward if this is the terminal
    # state and we haven't touched yet? A large penalty for finishing the round
    # with no touch might help (a sketch of one option appears after this loop).
    RL.store_transition(observation, action, reward, observation_)
    ep_r[i_episode] += reward

    # Start learning once enough transitions have been collected.
    if total_steps > 1000:
        cost = RL.learn()

    if env.is_touching():
        print('\ntouching at step', env.steps, 'total reward is ', ep_r[i_episode])
        games_where_touched += 1
        # Record the observation and its class label as training data for the CNN.
        cnn_features_TD[TD_cnt] = observation_
        cnn_labels_TD[TD_cnt] = env.class_label
        TD_cnt += 1
        ep_touch[i_episode] += 1

    if env.steps % 500 == 0:
        print('\nepisode: ', i_episode + 1, 'step: ', env.steps,
              'episode reward ', ep_r[i_episode])

    observation = observation_
    total_steps += 1
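# --- Sketch (not in the original code): one way to act on the reward-shaping
# question raised in the comment above. The helper name and the penalty value
# are hypothetical; it assumes the caller tracks whether a touch has happened
# this episode (e.g. ep_touch[i_episode] > 0). ---
def shape_reward(reward, done, touched_this_episode, no_touch_penalty=10.0):
    """Subtract a penalty when a round ends without any touch."""
    if done and not touched_this_episode:
        return reward - no_touch_penalty
    return reward

# Possible usage inside the loop above, in place of passing `reward` directly:
#   shaped = shape_reward(reward, done, ep_touch[i_episode] > 0)
#   RL.store_transition(observation, action, shaped, observation_)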