Example #1
# Excerpt: assumes `cnn`, `env`, `args`, `model`, and `select_action` are defined elsewhere.
import numpy as np
import torch
from itertools import count
from torch.autograd import Variable

classifier_optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)

running_reward = 0
batch = []
labels = []
total_steps = 0
if args.mode == "train" or args.mode == "all":
  for i_episode in count(1000):
    observation = env.reset()
    print("episode: ", i_episode)
    for t in range(1000):
      action = select_action(observation,env.action_space_n(),args.epsilon)
      observation, reward, done, info = env.step(action)
      model.rewards.append(reward)
      
      if env.is_touching():
        print("touching!")
        #print("batch size", len(batch))
        if len(batch) > args.batch_size:
          # CNN inputs should be float tensors; class labels should be long tensors.
          batch = torch.from_numpy(np.asarray(batch)).float()
          labels = torch.from_numpy(np.asarray(labels)).long()
          if args.gpu and torch.cuda.is_available():
            batch = batch.cuda()
            labels = labels.cuda()
          batch = Variable(batch)
          labels = Variable(labels)
          classifier_optimizer.zero_grad()
          outputs = cnn(batch)
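          # NOTE: the excerpt is truncated here. A typical continuation of this
          # supervised update (an assumption, not shown in the source) would be:
          #   loss = torch.nn.functional.cross_entropy(outputs, labels)
          #   loss.backward()
          #   classifier_optimizer.step()
          #   batch, labels = [], []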
Example #2
            # Excerpt from inside an episode loop: RL, env, ep_r, ep_touch, i_episode,
            # cnn_features_TD, cnn_labels_TD, TD_cnt, total_steps, and
            # games_where_touched are defined in the surrounding (omitted) code.
            done = False
            while not done:

                action = RL.choose_action(observation)
                observation_, reward, done, info = env.step(action)

                # something to consider - should we modify the reward if it's the terminal state and
                # we haven't touched yet? Massive penalty for finishing the round with no touch
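                # One hypothetical way to apply such a penalty (NO_TOUCH_PENALTY is an
                # assumed constant, not defined in the original example):
                #   if done and not env.is_touching():
                #       reward -= NO_TOUCH_PENALTY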
                RL.store_transition(observation, action, reward, observation_)

                ep_r[i_episode] += reward

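                # Begin DQN updates only after an initial warm-up of stored
                # transitions (here assumed to be 1000 environment steps).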
                if total_steps > 1000:
                    cost = RL.learn()

                if env.is_touching():
                    print('\ntouching at step', env.steps, 'total reward is ',
                          ep_r[i_episode])
                    games_where_touched += 1
                    cnn_features_TD[TD_cnt] = observation_
                    cnn_labels_TD[TD_cnt] = env.class_label
                    TD_cnt += 1
                    ep_touch[i_episode] += 1

                if env.steps % 500 == 0:
                    print('\nepisode: ', i_episode + 1, 'step: ', env.steps,
                          'episode reward ', ep_r[i_episode])

                observation = observation_
                total_steps += 1