            for player in range(1, 22):
                old_action = Policy[dealer - 1, player - 1]
                policy_improvement(env, dealer, player, Value_table)
                if old_action != Policy[dealer - 1, player - 1]:
                    policy_stable = False
        # print(Policy)
        if policy_stable:
            break

    # Estimate the probability of winning under the learned greedy policy
    # by playing 200,000 evaluation episodes.
    win = 0
    draw = 0  # number of drawn games
    win_rate = 0
    for i in range(200000):
        env.reset()
        while True:
            d, p = env.get_state()
            action = Policy[d - 1, p - 1]
            next_state, reward = env.step(action)
            if next_state == 'terminal':
                if reward > 0:
                    win += 1
                if reward == 0:
                    draw += 1
                win_rate = win / 200000
                break

    print(epi, win, draw, win_rate)
    result.append(win_rate)
    f = open('result', 'a')  # open the 'result' file for appending; created if it does not exist
    f.write(str(win_rate) + '\n')  # assumed payload: the original line is truncated after "f.write("
def train(net, rank):
    torch.set_num_threads(1)  # also do: export MKL_NUM_THREADS=1
    net.reset()
    env = Game(True, 4000 + rank + 1, max_steps=250)

    # Target network: a frozen copy of `net`, periodically re-synced below.
    target_net = Net(1254, 6, 36)
    target_net.load_state_dict(net.state_dict())
    target_net.reset()

    epsilon = epsilon1
    optimizer = optim.RMSprop(net.parameters(), lr=learning_rate)

    last_save = time.time()
    last_notify = time.time()
    last_sync = time.time()
    episode_number = 0
    terminal = True
    prev_value = None
    available_objects = None
    num_objects = len(env.objects)
    recent_rewards_of_episodes = []
    recent_steps_of_episodes = []

    # Per-quest success counters and rolling 100-episode success indicators.
    quest1_reward_cnt = 0
    quest2_reward_cnt = 0
    quest3_reward_cnt = 0
    quest4_reward_cnt = 0
    quest1_rewards = np.zeros(100)
    quest2_rewards = np.zeros(100)
    quest3_rewards = np.zeros(100)
    quest4_rewards = np.zeros(100)

    if rank == 0:
        stats = []

    while True:
        if terminal:
            # Start a new episode.
            student_saw_obelisk = False
            quest1_rewards[episode_number % len(quest1_rewards)] = 0
            quest2_rewards[episode_number % len(quest2_rewards)] = 0
            quest3_rewards[episode_number % len(quest3_rewards)] = 0
            quest4_rewards[episode_number % len(quest4_rewards)] = 0
            prev_value = None
            num_steps = 0
            net.reset()
            target_net.reset()
            state, reward, terminal, available_objects = env.reset()
            sum_rewards = reward

        state = torch.LongTensor(state)
        objects_probs = net(Variable(state.unsqueeze(0)))
        _objects_probs = objects_probs.data.numpy()

        # Choose an object epsilon-greedily, restricted to the currently available objects.
        if random.random() < epsilon:
            if available_objects is None:
                objects = list(enumerate(env.objects))
            else:
                objects = [
                    o for o in enumerate(env.objects) if o[0] in available_objects
                ]
            _object = random.choice(objects)[0]
        else:
            if available_objects is not None:
                # Mask out unavailable objects before taking the argmax.
                mask = np.zeros(num_objects)
                for e in available_objects:
                    mask[e] = 1
                _objects_probs = objects_probs.data.numpy() * mask
                _objects_probs = _objects_probs + (_objects_probs == 0) * -1e30
            _object = int(np.argmax(_objects_probs))

        prev_value = objects_probs[0, _object]

        # Step the environment and get new measurements.
        state, reward, terminal, available_objects = env.step(5, _object)
        sum_rewards += reward
        num_steps += 1

        # Curriculum bookkeeping: quests are identified by their reward magnitude.
        if reward > 10 - 0.0001:
            quest4_reward_cnt += 1
            quest4_rewards[episode_number % len(quest4_rewards)] = 1
        elif reward > 8 - 0.0001:
            quest3_reward_cnt += 1
            quest3_rewards[episode_number % len(quest3_rewards)] = 1
            if not disable_curriculum:
                if not student_saw_obelisk:
                    reward = -8
                    terminal = True
        elif reward > 7 - 0.0001:
            student_saw_obelisk = True
            quest2_reward_cnt += 1
            quest2_rewards[episode_number % len(quest2_rewards)] = 1
            if not disable_curriculum:
                if np.mean(quest2_rewards) < 0.75 and random.random() < 0.9:
                    terminal = True
        elif reward > 5 - 0.0001:
            quest1_reward_cnt += 1
            quest1_rewards[episode_number % len(quest1_rewards)] = 1
            if not disable_curriculum:
                if np.mean(quest1_rewards) < 0.9 and random.random() < 0.85:
                    terminal = True

        # Anneal epsilon once quests 1-3 are solved reliably.
        if 2 * epsilon > (epsilon1 + epsilon2):
            if (np.mean(quest1_rewards) > .98 and np.mean(quest2_rewards) > .98
                    and np.mean(quest3_rewards) > .98):
                epsilon = epsilon2
                if rank == 0:
                    notify("Epsilon is now:" + str(epsilon))

        # Bootstrap value of the next state: 0 at episode end; otherwise taken from
        # the target network when target_q_ts is set, else from the online network.
        if terminal:
            next_value = 0
        else:
            if target_q_ts is None:
                next_value = float(np.max(_objects_probs))
            else:
                state = torch.LongTensor(state)
                objects_probs = target_net(Variable(state.unsqueeze(0)))
                _objects_probs = objects_probs.data.numpy()
                if available_objects is not None:
                    mask = np.zeros(num_objects)
                    for e in available_objects:
                        mask[e] = 1
                    _objects_probs = _objects_probs * mask
                    _objects_probs = _objects_probs + (_objects_probs == 0) * -1e30
                next_value = float(np.max(_objects_probs))

        # One-step Q-learning loss: (r + gamma * max_a' Q(s', a') - Q(s, a))^2.
        loss = (reward + gamma * next_value - prev_value)**2

        # Update on important steps (|reward| > 4) and only ~5% of the remaining steps.
        if abs(reward) > 4 or random.random() < 0.05:
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            nn.utils.clip_grad_norm(net.parameters(), 1)
            optimizer.step()

        if terminal:
            # Episode bookkeeping: keep rolling windows over the last 100 episodes.
            recent_rewards_of_episodes.append(sum_rewards)
            recent_steps_of_episodes.append(num_steps)
            if len(recent_rewards_of_episodes) > 100:
                recent_rewards_of_episodes.pop(0)
            if len(recent_steps_of_episodes) > 100:
                recent_steps_of_episodes.pop(0)
            episode_number += 1

            # Periodically sync the target network with the online network.
            if target_q_ts is not None and time.time() - last_sync > target_q_ts:
                if rank == 0:
                    print("Update target")
                target_net.load_state_dict(net.state_dict())
                last_sync = time.time()

            if rank == 0:
                # Rank 0 records statistics, checkpoints, and notifications.
                stats.append({})
                stats[-1]["episode_number"] = episode_number
                stats[-1]["sum_rewards"] = sum_rewards
                stats[-1]["num_steps"] = num_steps
                stats[-1]["mean_recent_rewards_of_episodes"] = np.mean(
                    recent_rewards_of_episodes)
                stats[-1]["mean_recent_steps_of_episodes"] = np.mean(
                    recent_steps_of_episodes)
                stats[-1]["quest1_reward_cnt"] = quest1_reward_cnt
                stats[-1]["quest2_reward_cnt"] = quest2_reward_cnt
                stats[-1]["quest3_reward_cnt"] = quest3_reward_cnt
                stats[-1]["quest4_reward_cnt"] = quest4_reward_cnt
                stats[-1]["mean_quest1_rewards"] = np.mean(quest1_rewards)
                stats[-1]["mean_quest2_rewards"] = np.mean(quest2_rewards)
                stats[-1]["mean_quest3_rewards"] = np.mean(quest3_rewards)
                stats[-1]["mean_quest4_rewards"] = np.mean(quest4_rewards)

                summary = "{} {:.4} {} {:.4} {:.4} Qc: {} {} {} {} Q: {} {} {} {}".format(
                    episode_number, sum_rewards, num_steps,
                    np.mean(recent_rewards_of_episodes),
                    np.mean(recent_steps_of_episodes), quest1_reward_cnt,
                    quest2_reward_cnt, quest3_reward_cnt, quest4_reward_cnt,
                    np.mean(quest1_rewards), np.mean(quest2_rewards),
                    np.mean(quest3_rewards), np.mean(quest4_rewards))
                print(summary)

                if save_every is not None:
                    if time.time() - last_save > save_every:
                        print("Saving..")
                        torch.save(net.state_dict(), name)
                        with open(name_stats, "wb") as _fh:
                            pickle.dump(stats, _fh)
                        last_save = time.time()

                if notify_every is not None:
                    if time.time() - last_notify > notify_every:
                        print("Notify..")
                        notify(summary)
                        last_notify = time.time()

                if max_episodes is not None and episode_number == max_episodes:
                    torch.save(net.state_dict(), name)
                    with open(name_stats, "wb") as _fh:
                        pickle.dump(stats, _fh)
                    notify(summary)
                    notify("Done.")
                    print("Done.")
                    sys.exit()
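

# A minimal launcher sketch (an assumption, not part of the original source):
# train(net, rank) takes a worker rank and binds each Game instance to port
# 4000 + rank + 1, which suggests Hogwild-style asynchronous training over a
# network shared between processes. NUM_WORKERS and this __main__ guard are
# illustrative only; the real entry point may differ.
if __name__ == "__main__":
    import torch.multiprocessing as mp

    NUM_WORKERS = 4                # illustrative worker count (assumption)
    shared_net = Net(1254, 6, 36)  # same constructor as the target network above
    shared_net.share_memory()      # expose parameters to all worker processes
    workers = [
        mp.Process(target=train, args=(shared_net, rank))
        for rank in range(NUM_WORKERS)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()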