    for (action, value), r in zip(saved_actions, rewards):
        # advantage = actual return minus the critic's value estimate
        reward = r - value.data[0, 0]
        action.reinforce(reward)
        value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r])))
    optimizer.zero_grad()
    # backpropagate the value loss and all stochastic action nodes in one backward pass
    final_nodes = [value_loss] + list(map(lambda p: p.action, saved_actions))
    gradients = [torch.ones(1)] + [None] * len(saved_actions)
    autograd.backward(final_nodes, gradients)
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]

# train
env = SenseEnv(vars(args))
print("action space: ", env.action_space())
model = Policy(env.observation_space(), env.action_space_n())
cnn = CNN(env.classification_n())
if args.gpu and torch.cuda.is_available():
    model.cuda()
    cnn.cuda()
if args.model_path:
    if os.path.exists(args.model_path + "/model.pkl"):
        print("loading pretrained models")
        model.load_state_dict(torch.load(args.model_path + "/model.pkl"))
        cnn.load_state_dict(torch.load(args.model_path + "/cnn.pkl"))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
classifier_criterion = nn.CrossEntropyLoss()
classifier_optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)
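
# Episode/memory hyperparameters and a DQN agent for the touch-exploration training phase.
# (DeepQNetwork is assumed here to be the project's own DQN implementation, not a torch built-in.)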
num_games = 20
game_length = 1000
e_greedy_inc = 0.05 / game_length  # want epsilon to increase by 0.05 per game so we spend enough time exploring
mem_size = num_games * game_length
cnn_features_TD = np.zeros((num_games, 40000), dtype=np.int8)
cnn_labels_TD = np.zeros(num_games, dtype=np.int8)
cnn_features_ED = np.zeros((num_games, 40000), dtype=np.int8)
cnn_labels_ED = np.zeros(num_games, dtype=np.int8)
TD_cnt = 0  # counter to keep track of how many times we touch in the training phase

RL = DeepQNetwork(n_actions=env.action_space_n(),
                  n_features=env.observation_space(),
                  learning_rate=0.1,
                  e_greedy=0.9,
                  replace_target_iter=100,
                  memory_size=mem_size,
                  e_greedy_increment=e_greedy_inc)

if args.mode == "train" or args.mode == "all":
    games_where_touched = 0
    total_steps = 0
    ep_r = np.zeros(num_games)
    ep_touch = np.zeros(num_games)
    for i_episode in range(num_games):