import sys
from collections import defaultdict

import numpy as np


def Off_policy_MC_Control(env, episode_nums, discount_factor=1.0):
    # Off-policy MC control with weighted importance sampling.
    # Q holds the action-value estimates; return_count accumulates the
    # cumulative importance-sampling weights C(s, a).
    Q = defaultdict(lambda: np.zeros(env.nA))
    target_policy = defaultdict(float)
    return_count = defaultdict(float)
    for i_episode in range(1, 1 + episode_nums):
        env._reset()
        state = env.observation()
        episode = []
        prob_b = []  # behavior-policy probability of each action taken
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, episode_nums))
            sys.stdout.flush()
        # Generate an episode by following the behavior policy.
        for i in range(100):
            probs = sample_policy(Q, state, env.nA)
            action = np.random.choice(np.arange(env.nA), p=probs)
            next_state, reward, done = env._step(action)
            episode.append((state, action, reward))
            prob_b.append(probs[action])
            if done:
                break
            state = next_state
        # Work backwards through the episode, updating Q with weighted
        # importance sampling; the target policy is greedy w.r.t. Q.
        G = 0.0
        W = 1.0
        prob_b = prob_b[::-1]
        for idx, step in enumerate(episode[::-1]):
            state, action, reward = step
            pair = (state, action)
            G = discount_factor * G + reward
            return_count[pair] += W
            Q[state][action] += W / return_count[pair] * (G - Q[state][action])
            target_policy[state] = np.argmax(Q[state])
            # Once the behavior action deviates from the greedy target
            # action, the importance weight of all earlier steps is zero.
            if target_policy[state] != action:
                break
            # The target policy is deterministic (probability 1 for the
            # greedy action), so the weight update is 1 / b(a|s).
            W = W / prob_b[idx]
    return Q
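# Note: sample_policy is not defined in this section. Below is a minimal
# sketch, assuming an epsilon-soft behavior policy over Q is intended;
# off-policy MC with weighted importance sampling only requires that the
# behavior policy assign nonzero probability to every action, which this
# construction guarantees. The epsilon value here is an assumption.
def sample_policy(Q, state, nA, epsilon=0.3):
    # Spread epsilon uniformly over all actions, then shift the remaining
    # probability mass onto the greedy action; every action keeps
    # probability >= epsilon / nA.
    probs = np.ones(nA) * epsilon / nA
    best_action = np.argmax(Q[state])
    probs[best_action] += 1.0 - epsilon
    return probs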
def MC_Control_with_epsilon_greedy(env, episode_nums, discount_factor=1.0, epsilon=0.1):
    # On-policy first-visit MC control with an epsilon-greedy policy.
    Q = defaultdict(lambda: np.zeros(env.nA))
    return_sum = defaultdict(float)
    return_count = defaultdict(float)
    for i_episode in range(1, 1 + episode_nums):
        env._reset()
        state = env.observation()
        episode = []
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, episode_nums))
            sys.stdout.flush()
        # Generate an episode by following the epsilon-greedy policy.
        for i in range(100):
            probs = epsilon_greedy_policy(Q, state, env.nA, epsilon)
            action = np.random.choice(np.arange(env.nA), p=probs)
            next_state, reward, done = env._step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state
        # First-visit MC: average the return following the first occurrence
        # of each (state, action) pair in the episode.
        separate_episode = set((tuple(x[0]), x[1]) for x in episode)
        for state, action in separate_episode:
            for idx, e in enumerate(episode):
                if tuple(e[0]) == state and e[1] == action:
                    first_visit_idx = idx
                    break
            pair = (state, action)
            G = sum(e[2] * (discount_factor ** i)
                    for i, e in enumerate(episode[first_visit_idx:]))
            return_sum[pair] += G
            return_count[pair] += 1.0
            Q[state][action] = return_sum[pair] / return_count[pair]
    return Q
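# Note: epsilon_greedy_policy is likewise undefined in this section. A
# minimal sketch, assuming the conventional epsilon-greedy action
# distribution derived from Q:
def epsilon_greedy_policy(Q, state, nA, epsilon):
    # With probability epsilon explore uniformly at random; otherwise
    # take the greedy action under the current Q estimates.
    probs = np.ones(nA) * epsilon / nA
    best_action = np.argmax(Q[state])
    probs[best_action] += 1.0 - epsilon
    return probs


# Hypothetical usage, assuming a Blackjack environment class exposing the
# _reset/_step/observation interface and nA attribute used above:
# env = Blackjack()
# Q_off = Off_policy_MC_Control(env, episode_nums=500000)
# Q_eps = MC_Control_with_epsilon_greedy(env, episode_nums=500000, epsilon=0.1)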