Example #1
def double_q_learning(env,
                      num_episodes,
                      discount_factor=1.0,
                      alpha=0.5,
                      epsilon=0.1):

    # Off-policy TD control: find the optimal greedy policy while following an epsilon-greedy behaviour policy

    Q_A = defaultdict(lambda: np.zeros(env.action_space.n))

    Q_B = defaultdict(lambda: np.zeros(env.action_space.n))

    Total_Q = defaultdict(lambda: np.zeros(env.action_space.n))

    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # state = 0
    # actions_init = 0
    # Total_Q[state][actions_init] = Q_A[state][actions_init] + Q_B[state][actions_init]

    # act epsilon-greedily with respect to the combined estimate Q_A + Q_B
    policy = make_epsilon_greedy_policy(Total_Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):

        state = env.reset()

        for t in itertools.count():

            #choose a from policy derived from Q1 + Q2 (epsilon greedy here)
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            # with the chosen action, observe the reward and the next state
            next_state, reward, done, _ = env.step(action)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # randomly choose whether to update Q_A or Q_B
            random_number = random.randint(1, 2)

            if random_number == 1:
                best_action_Q_A = np.argmax(Q_A[next_state])
                TD_Target_A = reward + discount_factor * Q_B[next_state][
                    best_action_Q_A]
                TD_Delta_A = TD_Target_A - Q_A[state][action]
                Q_A[state][action] += alpha * TD_Delta_A

            elif random_number == 2:
                best_action_Q_B = np.argmax(Q_B[next_state])
                TD_Target_B = reward + discount_factor * Q_A[next_state][
                    best_action_Q_B]
                TD_Delta_B = TD_Target_B - Q_B[state][action]
                Q_B[state][action] += alpha * TD_Delta_B

            # keep the combined estimate (used by the epsilon-greedy policy) in sync
            Total_Q[state][action] = Q_A[state][action] + Q_B[state][action]

            if done:
                break

            state = next_state

    return Total_Q, stats
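These snippets rely on a handful of names the listing never defines: the standard-library and numpy imports, a plotting module providing EpisodeStats, and a make_epsilon_greedy_policy helper (called here with a tabular Q dictionary; later examples call variants that take a function approximator or extra arguments). A minimal sketch of the shared imports and of the tabular helper, inferred only from how they are used above, might look like:

import itertools
import random
from collections import defaultdict

import numpy as np


def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Assumed helper: return a function mapping a state to epsilon-greedy action probabilities."""
    def policy_fn(observation):
        action_probs = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        action_probs[best_action] += 1.0 - epsilon
        return action_probs
    return policy_fn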
def two_step_tree_backup(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):


    Q = defaultdict(lambda : np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),episode_rewards=np.zeros(num_episodes))  

    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):

    	print "Number of Episodes, Two Step Tree Backup", i_episode

        state = env.reset()

        #steps within each episode
        for t in itertools.count():
            #pick the first action
            #choose A from S using policy derived from Q (epsilon-greedy)
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p = action_probs)

            #reward and next state based on the action chosen according to the epsilon-greedy policy
            next_state, reward, done, _ = env.step(action)
            
            #reward by taking action under the policy pi
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)), p =next_action_probs )

            #V = sum_a pi(a, s_{t+1})Q(s_{t+1}, a)
            V = np.sum(next_action_probs * Q[next_state])


            next_next_state, next_reward, _, _ = env.step(next_action)
    
            next_next_action_probs = policy(next_next_state)
            next_next_action = np.random.choice(np.arange(len(next_next_action_probs)), p = next_next_action_probs)

            next_V = np.sum(next_next_action_probs * Q[next_next_state])            

            Delta = next_reward + discount_factor * next_V - Q[next_state][next_action]

            # print "Delta :", Delta

            # print "Next Action Prob ", np.max(next_action_probs)

            next_action_selection_probability = np.max(next_action_probs)

            td_target = reward + discount_factor * V +  discount_factor *  next_action_selection_probability * Delta

            td_delta = td_target - Q[state][action]

            Q[state][action] += alpha * td_delta


            if done:
                break

            state = next_state

    return stats
def Q_Sigma_Off_Policy(env, num_episodes, discount_factor=1.0, alpha=0.1, epsilon=0.1):

    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
    tau = 1
    tau_decay = 0.999

    sigma = 1
    sigma_decay = 0.995

    for i_episode in range(num_episodes):

        print("Number of Episodes, Q(sigma) Off Policy", i_episode)

        policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

        off_policy = behaviour_policy_epsilon_greedy(Q, tau, env.action_space.n)

        tau = tau * tau_decay

        if tau < 0.0001:
            tau = 0.0001

        state = env.reset()

        for t in itertools.count():

            # act according to the behaviour policy
            action_probs = off_policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

            state_t_1, reward, done, _ = env.step(action)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                sigma = sigma * sigma_decay
                if sigma < 0.0001:
                    sigma = 0.0001
                break

            # #select sigma value
            # probability = 0.5
            # sigma_t_1 = binomial_sigma(probability)

            sigma_t_1 = sigma

            #select next action based on the behaviour policy at next state
            next_action_probs = off_policy(state_t_1)
            action_t_1 = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            #expected value of the next state under the target (epsilon-greedy) policy
            on_policy_next_action_probs = policy(state_t_1)
            on_policy_a_t_1 = np.random.choice(np.arange(len(on_policy_next_action_probs)), p=on_policy_next_action_probs)
            V_t_1 = np.sum(on_policy_next_action_probs * Q[state_t_1])

            # Q(sigma) one-step target: sigma = 1 uses the sampled action value (SARSA-style),
            # sigma = 0 uses the expected value under the target policy
            Delta_t = reward + discount_factor * (sigma_t_1 * Q[state_t_1][action_t_1] + (1 - sigma_t_1) * V_t_1) - Q[state][action]

            Q[state][action] += alpha * Delta_t

            state = state_t_1

    return stats
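A quick numeric check of the Q(sigma) backup used in Delta_t above (the numbers are made up): with sigma = 1 the target collapses to the sampled SARSA target, and with sigma = 0 to the expected, Expected-SARSA-style target.

import numpy as np

gamma = 1.0
reward = 1.0
q_next = np.array([0.5, 2.0])          # Q[state_t_1]
probs_next = np.array([0.25, 0.75])    # on-policy probabilities at state_t_1
a_next = 1                             # sampled next action
V_next = np.sum(probs_next * q_next)   # expected value under the target policy

for sigma in (1.0, 0.0):
    target = reward + gamma * (sigma * q_next[a_next] + (1 - sigma) * V_next)
    print(sigma, target)               # 1.0 -> 3.0 (SARSA), 0.0 -> 2.625 (expected)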
Example #4
def q_lambda_watkins(env,
                     num_episodes,
                     discount_factor=1.0,
                     alpha=0.1,
                     epsilon=0.1):

    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  episode_error=np.zeros(num_episodes))

    lambda_param = np.array(
        [0, 0.1, 0.15, 0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.975, 0.99, 1])

    alpha = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1])

    All_Rwd_Lambda = np.zeros(shape=(num_episodes, len(lambda_param)))
    All_Lambda_Alpha = np.zeros(shape=(len(lambda_param), len(alpha)))

    All_Error_Lambda = np.zeros(shape=(num_episodes, len(lambda_param)))
    All_Error_Lambda_Alpha = np.zeros(shape=(len(lambda_param), len(alpha)))

    num_experiments = num_episodes

    for l in range(len(lambda_param)):

        print "Lambda Param", lambda_param[l]

        for alpha_param in range(len(alpha)):

            print "Alpha Param", alpha[alpha_param]

            for i_episode in range(num_episodes):

                print "Number of Episodes, Q(lambda) Watkins", i_episode

                policy = make_epsilon_greedy_policy(Q, epsilon,
                                                    env.action_space.n)
                state = env.reset()
                next_action = None

                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)),
                                          p=action_probs)

                eligibility = defaultdict(lambda: np.zeros(env.action_space.n))
                #initialising eligibility traces

                for t in itertools.count():

                    next_state, reward, done, _ = env.step(action)

                    stats.episode_rewards[i_episode] += reward
                    stats.episode_lengths[i_episode] = t

                    next_action_probs = policy(next_state)
                    next_action = np.random.choice(np.arange(
                        len(next_action_probs)),
                                                   p=next_action_probs)

                    best_action = np.argmax(Q[next_state])

                    Delta = reward + discount_factor * Q[next_state][
                        best_action] - Q[state][action]

                    # RMS of the one-step TD error
                    rms_error = np.sqrt(np.sum(Delta**2) / num_experiments)
                    stats.episode_error[i_episode] += rms_error

                    eligibility[state][action] = eligibility[state][action] + 1

                    for s in range(env.observation_space.n):
                        for a in range(env.action_space.n):
                            Q[s][a] = Q[s][a] + alpha[
                                alpha_param] * Delta * eligibility[s][a]

                            eligibility[s][a] = eligibility[s][
                                a] * discount_factor * lambda_param[l]

                    if done:
                        break

                    action = next_action
                    state = next_state

            cum_rwd_per_episode = pd.Series(
                stats.episode_rewards).rolling(1, min_periods=1).mean().values
            cum_error_per_episode = pd.Series(
                stats.episode_error).rolling(1, min_periods=1).mean().values

            All_Rwd_Lambda[:, l] = cum_rwd_per_episode
            All_Error_Lambda[:, l] = cum_error_per_episode
            All_Lambda_Alpha[l, alpha_param] = cum_error_per_episode[-1]
            All_Error_Lambda_Alpha[l, alpha_param] = cum_error_per_episode[-1]

    return All_Rwd_Lambda, All_Lambda_Alpha, All_Error_Lambda, All_Error_Lambda_Alpha
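The inner double loop over every state-action pair dominates the cost of the Q(lambda) update above. A sketch of an equivalent formulation (assuming the tabular case, with Q and the eligibility trace stored as dense numpy arrays rather than defaultdicts) reduces it to three vectorized operations:

import numpy as np

def q_lambda_trace_update(Q, E, state, action, delta, alpha, gamma, lam):
    """Q and E have shape (n_states, n_actions); delta is the one-step TD error."""
    E[state, action] += 1.0    # accumulating eligibility trace
    Q += alpha * delta * E     # apply the TD update to every traced pair
    E *= gamma * lam           # decay all traces
    return Q, E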
def two_step_q_sigma_on_policy(env,
                               num_episodes,
                               discount_factor=1.0,
                               alpha=0.5,
                               epsilon=0.9):

    # Two-step Q(sigma): sigma = 1 backs up the sampled (SARSA-style) action value,
    # sigma = 0 backs up the expected value under the policy (tree-backup style);
    # sigma is re-drawn at every step below.
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):

        state = env.reset()
        action_probs = policy(state)

        #choose a from policy derived from Q (which is epsilon-greedy)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

        #steps within each episode
        for t in itertools.count():

            sigma = random.randint(0, 1)

            #if using a random number for sigma
            #sigma = np.random.rand(1)

            next_state, reward, done, _ = env.step(action)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)),
                                           p=next_action_probs)

            V = np.sum(next_action_probs * Q[next_state])
            One_Sigma_Effect = sigma * Q[next_state][next_action] + (1 -
                                                                     sigma) * V
            One_Step = reward + discount_factor * One_Sigma_Effect

            next_action_selection_probability = np.max(next_action_probs)
            Two_Step = -discount_factor * (
                1 - sigma
            ) * next_action_selection_probability * Q[next_state][next_action]

            next_next_state, next_reward, _, _ = env.step(next_action)
            next_next_action_probs = policy(next_next_state)
            next_next_action = np.random.choice(np.arange(
                len(next_next_action_probs)),
                                                p=next_next_action_probs)

            V_next = np.sum(next_next_action_probs * Q[next_next_state])
            Three_Sigma_Effect = sigma * Q[next_next_state][
                next_next_action] + (1 - sigma) * V_next
            Int_Three_Step = next_reward + discount_factor * Three_Sigma_Effect
            Three_Step = discount_factor * (
                1 - sigma) * next_action_selection_probability * Int_Three_Step

            Fourth_Step = -discount_factor * sigma * Q[next_state][next_action]

            Fifth_Sigma_Effect = sigma * Q[next_next_state][
                next_next_action] + (1 - sigma) * V_next
            Int_Fifth_Step = discount_factor * Fifth_Sigma_Effect
            Int_Int_Fifth_Step = next_reward + Int_Fifth_Step
            Fifth_Step = discount_factor * sigma * Int_Int_Fifth_Step

            td_target = One_Step + Two_Step + Three_Step + Fourth_Step + Fifth_Step

            td_delta = td_target - Q[state][action]

            Q[state][action] += alpha * td_delta

            if done:
                break

            action = next_action
            state = next_state

    return Q, stats
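Both Q(sigma) examples draw sigma per step: here with random.randint(0, 1), and in the off-policy variant through a commented-out binomial_sigma(probability) helper that is never shown. A minimal sketch of such a helper, assumed only from its name and call site, could be:

import numpy as np

def binomial_sigma(probability):
    """Assumed helper: return 1 with the given probability, otherwise 0."""
    return np.random.binomial(n=1, p=probability)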
Example #6
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        update_target_estimator_every: Copy parameters from the Q estimator to the
          target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    # checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")
    macro_dir = os.path.join(experiment_dir, "macro")
    macro_path = os.path.join(macro_dir, "macro.txt")

    worker_dir = os.path.abspath("./{}/{}".format(WORKER_SAVE_DIR,
                                                  env.spec.id))
    output_path = os.path.join(worker_dir, "output_score")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)
    if not os.path.exists(macro_dir):
        os.makedirs(macro_dir)

    # Saver to save/restore weights
    saver = tf.train.Saver()

    ckpt_idx, ckpt = get_latest_ckpt(os.listdir(checkpoint_dir))
    if ckpt is not None:
        os.system('cp -r {} ./tmp/'.format(checkpoint_dir))
        ckpt = os.path.join('./tmp/checkpoints', ckpt)

        saver.restore(sess, ckpt)
        print('Restore model_{}.ckpt'.format(ckpt_idx))

    # The replay memory
    replay_memory = []

    total_t = sess.run(
        tf.contrib.framework.get_global_step())  # total_t = 0, for q value
    tf_update_idx = 0  # for plotting graph (reward and epsilon)

    # Restore replay memory if exists
    if os.path.isfile(macro_path):
        macro_log = load_log(
            macro_path
        )  # [[skills, reward, epsilon], [skills, reward, epsilon], ...]
        print("Reading macro.txt")
        print("Resume training from {} skills...".format(len(macro_log)))
        for log_skills in macro_log:
            replay_memory = macro_log_to_replay_memory(replay_memory,
                                                       log_skills,
                                                       VALID_ACTIONS,
                                                       REWARD_FACTOR)
            tf_update_idx += 1  # for plotting

    assert tf_update_idx >= ckpt_idx, 'Unexpected checkpoint. Checkpoint version is higher than the replay memory.'

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, NUM_EXPLORE_SKILL)
    _ = np.full(shape=(MAX_NUM_TOTAL_SKILL - NUM_EXPLORE_SKILL, ),
                fill_value=epsilon_end)
    epsilons = np.concatenate((epsilons, _), axis=0)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    state = env.reset()

    start_time = time.time()

    # populating initial skills, random generate skill
    while tf_update_idx < NUM_INIT_SKILL:
        # Generate action sequence
        print("{}/{} skills @ Init replay memory".format(
            tf_update_idx + 1, NUM_INIT_SKILL))
        actions = []
        state = env.reset()
        epsilon = epsilons[0]  # 1.0
        while True:
            action_probs = policy(
                sess, state, epsilon)  # epsilon = 1.0, totally random search
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, _, done, _ = env.step(VALID_ACTIONS[action])
            # collect actions sequence
            actions.append(VALID_ACTIONS[action])
            if done:
                break

            else:
                state = next_state

        # map action to workers
        print("Waiting for mapping...")
        env.map_single_reward(
            actions, epsilon)  # returns once the actions have been sent to a worker

        # check if any macro ensemble be calculated.
        while tf_update_idx < NUM_INIT_SKILL:
            skill, reward, epsilon = read_score(
                output_path)  # Return [None, None] if nothing new

            if skill is not None:
                factor_episode_reward = reward * REWARD_FACTOR
                # Write log into tensorboard
                episode_summary = tf.Summary()
                episode_summary.value.add(simple_value=factor_episode_reward,
                                          node_name="factor_episode_reward",
                                          tag="factor_episode_reward")
                episode_summary.value.add(simple_value=reward,
                                          node_name="episode_reward",
                                          tag="episode_reward")
                episode_summary.value.add(simple_value=epsilon,
                                          node_name="epsilon",
                                          tag="epsilon")
                episode_summary.value.add(simple_value=LEN_SKILL,
                                          node_name="episode_length",
                                          tag="episode_length")
                q_estimator.summary_writer.add_summary(episode_summary,
                                                       tf_update_idx)
                q_estimator.summary_writer.flush()
                tf_update_idx += 1
                write_reward_to_log(macro_path,
                                    skill,
                                    reward,
                                    epsilon,
                                    NUM_SKILL=NUM_SKILL,
                                    MAXLEN_PER_SKILL=MAXLEN_PER_SKILL)

                # add the new macro ensemble into the replay buffer
                skill = np.array(skill).reshape(-1).tolist()

                state = env.reset()
                for a in skill:
                    next_state, _, done, _ = env.step(
                        a)  # a has been mapped to VALID_ACTIONS

                    # Add data to replay memory
                    if done:
                        replay_memory.append(
                            Transition(state, VALID_ACTIONS.index(a),
                                       factor_episode_reward, next_state,
                                       done))

                    else:
                        replay_memory.append(
                            Transition(state, VALID_ACTIONS.index(a), 0,
                                       next_state, done))
                        state = next_state
            else:
                break

    # train model on the already-loaded macros (number of loaded macros > NUM_INIT_SKILL)
    if tf_update_idx > NUM_INIT_SKILL:
        for i in range(tf_update_idx - NUM_INIT_SKILL):
            partial_memory = replay_memory[:LEN_SKILL * (NUM_INIT_SKILL + i)]
            # Train agent
            if NUM_INIT_SKILL + i > ckpt_idx:
                q_estimator, target_estimator = train_agent(
                    sess=sess,
                    replay_memory=partial_memory,
                    q_estimator=q_estimator,
                    target_estimator=target_estimator,
                    NUM_EPOCH=NUM_EPOCH,
                    discount_factor=discount_factor,
                    BATCH_SIZE=BATCH_SIZE,
                    NUM_TOTAL_SKILL=MAX_NUM_TOTAL_SKILL,
                    update_target_estimator_every=update_target_estimator_every,
                    tf_update_idx=NUM_INIT_SKILL + i,  # current skill id
                )

    # Main training loop
    skill = []  # Data sent back from worker. Initialize skill to null.
    while tf_update_idx < MAX_NUM_TOTAL_SKILL:
        if time.time() - start_time > MAX_LIMIT_TIME:
            break
        # Save model
        if tf_update_idx % 50 == 0:
            ckpt = "model_{}.ckpt".format(tf_update_idx)
            print('Saving model_{}.ckpt'.format(tf_update_idx))
            saver.save(sess, os.path.join(checkpoint_dir, ckpt))

        # If our replay memory is full, pop the first element
        while len(replay_memory) > replay_memory_size:
            replay_memory.pop(0)

        if skill is not None:
            # Train agent with the current replay memory
            if tf_update_idx > ckpt_idx:

                q_estimator, target_estimator = train_agent(
                    sess=sess,
                    replay_memory=replay_memory,
                    q_estimator=q_estimator,
                    target_estimator=target_estimator,
                    NUM_EPOCH=NUM_EPOCH,
                    discount_factor=discount_factor,
                    BATCH_SIZE=BATCH_SIZE,
                    NUM_TOTAL_SKILL=MAX_NUM_TOTAL_SKILL,
                    update_target_estimator_every=update_target_estimator_every,
                    tf_update_idx=tf_update_idx,
                )

        # populate a new macro ensemble
        if skill is None:
            print("{}/{} th skills".format(tf_update_idx + 1,
                                           MAX_NUM_TOTAL_SKILL))
            actions = []
            epsilon = epsilons[
                tf_update_idx -
                NUM_INIT_SKILL]  # Epsilon for this macro ensemble
            state = env.reset()
            while True:
                action_probs = policy(sess, state, epsilon)
                action = np.random.choice(np.arange(len(action_probs)),
                                          p=action_probs)
                next_state, _, done, _ = env.step(VALID_ACTIONS[action])
                # collect action sequence
                actions.append(VALID_ACTIONS[action])
                if done:
                    break
                    # if args.dup:    # duplicated macro action in one ensemble is valid.
                    #     break
                    # else:           # add penalty to duplicated macro action
                    #     replay_memory, states, next_states, actions, dones, is_dup = check_duplicated_macro_action(
                    #                                                         replay_memory=replay_memory,
                    #                                                         states=states, next_states=next_states,
                    #                                                         actions=actions, dones=dones)
                    #     if is_dup:
                    #         continue
                    #     else:
                    #         break
                else:
                    state = next_state

            # map action to workers
            print("Waiting for mapping...")
            env.map_single_reward(
                actions, epsilon)  # returns once the actions have been sent to a worker

        # check if any macro ensemble be calculated.
        skill, reward, epsilon = read_score(
            output_path)  # Return [None, None] if nothing new

        if skill is not None:
            factor_episode_reward = reward * REWARD_FACTOR
            # Write log into tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=factor_episode_reward,
                                      node_name="factor_episode_reward",
                                      tag="factor_episode_reward")
            episode_summary.value.add(simple_value=reward,
                                      node_name="episode_reward",
                                      tag="episode_reward")
            episode_summary.value.add(simple_value=epsilon,
                                      node_name="epsilon",
                                      tag="epsilon")
            episode_summary.value.add(simple_value=LEN_SKILL,
                                      node_name="episode_length",
                                      tag="episode_length")
            q_estimator.summary_writer.add_summary(episode_summary,
                                                   tf_update_idx)
            q_estimator.summary_writer.flush()
            tf_update_idx += 1
            write_reward_to_log(macro_path,
                                skill,
                                reward,
                                epsilon,
                                NUM_SKILL=NUM_SKILL,
                                MAXLEN_PER_SKILL=MAXLEN_PER_SKILL)

            # add the new macro ensemble into the replay buffer
            skill = np.array(skill).reshape(-1).tolist()

            state = env.reset()
            for a in skill:
                next_state, _, done, _ = env.step(
                    a)  # a has been mapped to VALID_ACTIONS

                # Add data to replay memory
                if done:
                    replay_memory.append(
                        Transition(state, VALID_ACTIONS.index(a),
                                   factor_episode_reward, next_state, done))

                else:
                    replay_memory.append(
                        Transition(state, VALID_ACTIONS.index(a), 0,
                                   next_state, done))
                    state = next_state
            READ_REWARD_AGAIN = True

    env.close()
    return stats
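The replay memory above appends Transition tuples without defining them in this snippet; Example #8 below builds the same-looking tuples with collections.namedtuple, and presumably the identical definition is assumed here:

import collections

# Assumed to match the definition used in Example #8.
Transition = collections.namedtuple(
    "Transition", ["state", "action", "reward", "next_state", "done"])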
def q_learning(env,
               estimator,
               num_episodes,
               discount_factor=1.0,
               epsilon=0.1,
               epsilon_decay=1.0):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        estimator: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: Each episode, epsilon is decayed by this factor

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):

        # The policy we're following
        policy = make_epsilon_greedy_policy(estimator,
                                            epsilon * epsilon_decay**i_episode,
                                            env.action_space.n)

        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        sys.stdout.flush()

        # Reset the environment and pick the first action
        state = env.reset()

        # Only used for SARSA, not Q-Learning
        next_action = None

        # One step in the environment
        for t in itertools.count():

            # Choose an action to take
            # If we're using SARSA we already decided in the previous step
            if next_action is None:
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)),
                                          p=action_probs)
            else:
                action = next_action

            # Take a step
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # TD Update
            q_values_next = estimator.predict(next_state)

            # Use this code for Q-Learning
            # Q-Value TD Target
            td_target = reward + discount_factor * np.max(q_values_next)

            # Use this code for SARSA TD Target for on policy-training:
            # next_action_probs = policy(next_state)
            # next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
            # td_target = reward + discount_factor * q_values_next[next_action]

            # Update the function approximator using our target
            estimator.update(state, action, td_target)

            print("\rStep {} @ Episode {}/{} ({})".format(
                t, i_episode + 1, num_episodes, last_reward),
                  end="")

            if done:
                break

            state = next_state

    return stats
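This function-approximation variant calls make_epsilon_greedy_policy(estimator, epsilon, nA) and expects policy(state) to return action probabilities, with estimator.predict(state) returning an array of Q-values. A hedged sketch of that variant, differing from the tabular helper sketched earlier only in where the Q-values come from:

import numpy as np

def make_epsilon_greedy_policy(estimator, epsilon, nA):
    """Assumed helper: epsilon-greedy action probabilities from a Q-value estimator."""
    def policy_fn(state):
        action_probs = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(state)      # array of nA action values
        action_probs[np.argmax(q_values)] += 1.0 - epsilon
        return action_probs
    return policy_fn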
Example #8
def actor_critic(env, estimator_policy_X, estimator_value_X, trainer_X, num_episodes, discount_factor=1.0, player2=True, positiveRewardFactor=1.0, negativeRewardFactor=1.0, batch_size=1):
    """
    Actor Critic Algorithm. Optimizes the policy
    function approximator using policy gradient.

    Args:
        env: OpenAI environment.
        estimator_policy_X: Policy Function to be optimized
        estimator_value_X: Value function approximator, used as a critic
        trainer_X: our training class
        num_episodes: Number of episodes to run for
        discount_factor: Time-discount factor
        player2: True if computer plays player2, False if user does
        positiveRewardFactor: Scaling factor applied to the terminal (winning) reward
        negativeRewardFactor: Scaling factor applied to the negated intermediate rewards
        batch_size: Batch size

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
        episode_td_error=np.zeros(num_episodes),
        episode_value_loss=np.zeros(num_episodes),
        episode_policy_loss=np.zeros(num_episodes),
        episode_kl_divergence=np.zeros(num_episodes))

    Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    batch_board_X = np.zeros((batch_size, 7, 6, 2))
    batch_player_X = np.zeros((batch_size, 2))
    batch_td_target_X = np.zeros((batch_size, 1))
    batch_td_error_X =np.zeros((batch_size, 1))
    batch_action_X =np.zeros((batch_size, 1))
    batch_avaliableColumns_X = np.zeros((batch_size, 7))


    batch_pos_X = 0

    game = 1

    for i_episode in range(num_episodes):
        # Reset the environment and pick the first action
        state = env.reset(i_episode % 2 + 1)
        robotLevel = i_episode%4 + 1

        episode = []

        probas = None
        last_turn = False
        done = False
        last_state = None
        action = None
        reward = None

        # if game % 5000 == 10:
        #     player2 = True
        # elif game % 5000 == 0:
        #     player2 = False

        if game == num_episodes-3:
            player2 = False

        # One step in the environment
        for t in itertools.count():
            # Save avaliable columns
            if not done:
                avaliableColumns = env.getAvaliableColumns()

            currentPlayerBeforeStep = env.getCurrentPlayer()

            action_tmp = action

            # Take a step
            if currentPlayerBeforeStep == 1 and not done or currentPlayerBeforeStep == 2 and player2 and not done:
                action, probas = estimator_policy_X.predict(env)
                action = action[0]
                probas = probas[0]
            elif not done:
                try:
                    action = int(input("Give a column number: ")) - 1
                except ValueError:
                    print("Wrong input! Setting action to 1")
                    action = 0
                probas = None

            if currentPlayerBeforeStep == 2 and player2 and not done:
                next_state, reward, step_done, action = env.robotStep(robotLevel)
            elif not done:
                next_state, reward, step_done, _ = env.step(action)

            if not done:

                if game == num_episodes-3:
                    pass
                    #layer1, layer2 = trainer_X.evalFilters(next_state[1])
                    #plotting.plotNNFilter(next_state[1], layer1, layer2)


                if step_done:
                    pass

                if t > 0:
                    state_tmp = last_state
                    last_state = state
                    reward_tmp = -reward*negativeRewardFactor
                else:
                    state_tmp = state
                    last_state = state
                    reward_tmp = -reward*negativeRewardFactor


            elif done and not last_turn:
                state_tmp = episode[-2].next_state
                reward_tmp = reward*positiveRewardFactor
            else:
                break




            if t > 0:
                episode.append(Transition(
                    state=state_tmp, action=action_tmp, reward=reward_tmp, next_state=next_state, done=done))

                player = None
                if episode[-1].state[0][0] == 1:
                    player = "X"
                elif episode[-1].state[0][1] == 1:
                    player = "O"
                # Update statistics
                stats.episode_lengths[i_episode] = t

                # If player 0 (X)
                if episode[-1].state[0][0] == 1 or True:

                    if episode[-1].state[0][0] == 1:
                        stats.episode_rewards[i_episode] += episode[-1].reward
                    # Calculate TD Target
                    value_next = estimator_value_X.predict(episode[-1].next_state)
                    td_target = episode[-1].reward + discount_factor * value_next
                    td_error = td_target - estimator_value_X.predict(episode[-1].state)

                    if episode[-1].state[0][0] == 1:
                        batch_board_X[batch_pos_X] = episode[-1].state[1]
                    else:
                        batch_board_X[batch_pos_X] = invertBoard(episode[-1].state[1])
                    batch_player_X[batch_pos_X] = episode[-1].state[0]
                    batch_td_target_X[batch_pos_X] = td_target
                    batch_td_error_X[batch_pos_X] = td_error
                    batch_action_X[batch_pos_X] = episode[-1].action
                    batch_avaliableColumns_X[batch_pos_X] = avaliableColumns

                    batch_pos_X += 1
                # else:
                #     value_next = estimator_value_O.predict(episode[-1].next_state, )
                #     td_target = episode[-1].reward + discount_factor * value_next
                #     td_error = td_target - estimator_value_O.predict(episode[-1].state)
                #
                #     batch_player_O[batch_pos_O] = episode[-1].state[0]
                #     batch_board_O[batch_pos_O] = episode[-1].state[1]
                #     batch_td_target_O[batch_pos_O] = td_target
                #     batch_td_error_O[batch_pos_O] = td_error
                #     batch_action_O[batch_pos_O] = episode[-1].action
                #     batch_avaliableColumns_O[batch_pos_O] = avaliableColumns
                #
                #     batch_pos_O += 1


                stats.episode_td_error[i_episode] += td_error

                if batch_pos_X == batch_size:
                    # Update both networks
                    loss_X, policyLoss, valueLoss = trainer_X.update(batch_board_X, batch_td_target_X, batch_td_error_X, batch_action_X, batch_avaliableColumns_X)
                    loss_X = loss_X[0][0]
                    policyLoss = policyLoss[0][0]
                    valueLoss = valueLoss[0][0]
                    batch_pos_X = 0

                    print("Updates X network. Loss:", loss_X)
                    stats.episode_value_loss[i_episode] += valueLoss

                # if batch_pos_O == batch_size:
                #     # Update both networks
                #     loss_O = trainer_O.update(batch_board_O, batch_td_target_O, batch_td_error_O, batch_action_O,
                #                               batch_avaliableColumns_O)
                #     loss_O = loss_O[0][0]
                #     batch_pos_O = 0
                #
                #     print("Updates X network. Loss:", loss_O)
                #     stats.episode_value_loss[i_episode] += loss_O

                    if probas is not None and last_probas is not None:
                        kl_div = 0
                        for i in range(probas.size):
                            kl_div += probas[i]*np.log(probas[i]/last_probas[i])
                        stats.episode_kl_divergence[i_episode] += kl_div

                # Print out which step we're on, useful for debugging.
                print(
                    "\rPlayer {}: Action {}, Reward {:<4}, TD Error {:<20}, TD Target {:<20}, Value Next {:<20}, at Step {:<5} @ Game {} @ Episode {}/{} ({})".format(
                        player, int(episode[-1].action + 1), episode[-1].reward, td_error, td_target, value_next, t,
                        game, i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]), end="")

                if player == "X" and episode[-1].reward > 0 and robotLevel > 1:# or i_episode % 100 == 0:
                    for i in range(t):
                        print("Player:", batch_player_X[batch_pos_X-t+i], "Action:", int(batch_action_X[batch_pos_X-t+i])+1 )
                    print("Robot level:", robotLevel)
                    env.renderHotEncodedState( ((1, 0), batch_board_X[batch_pos_X-1]) )

            if game == num_episodes or env.getCurrentPlayer() == 2 and not player2:
                env.render()
                if probas is not None:
                    out = " "
                    for i in range(probas.size):
                        out += "%03d " % int(probas[i]*100+0.5)
                    print(out)

            last_probas = probas

            if done:
                last_turn = True
                game += 1


            if step_done:
                done = True

            state = next_state

    return stats
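The KL-divergence bookkeeping above sums term by term in a Python loop; with numpy arrays the same quantity can be computed in one expression (a sketch, assuming probas and last_probas are strictly positive arrays of equal length):

import numpy as np

def kl_divergence(probas, last_probas):
    """KL(probas || last_probas), equivalent to the element-wise loop above."""
    return np.sum(probas * np.log(probas / last_probas))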
Example #9
def main():

    global nTrucks
    global state
    global num_of_load
    global num_of_dump
    global num_of_return
    BucketA_capacity = 1.5
    BucketB_capacity = 1.0
    Truck1_capacity = 6
    Truck2_capacity = 3
    Truck1_speed = 15.0
    Truck2_speed = 20.0
    Truck1_speedRatio = Truck1_speed / (Truck1_speed + Truck2_speed)
    Truck2_speedRatio = Truck2_speed / (Truck1_speed + Truck2_speed)
    i_episode = 0

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
        episode_loss=np.zeros(num_episodes))

    #seed - TODO seed()
    np.random.seed(0)

    #initialize master and workers
    tf.reset_default_graph()
    with tf.device("/cpu:0"):
        trainer_critic = tf.train.AdamOptimizer(learning_rate=alpha_critic)
        trainer_actor = tf.train.AdamOptimizer(learning_rate=alpha_actor)
        master_network = AC_Network(nS, nA, 'global', None, None)
        #num_threads = multiprocessing.cpu_count() #TODO does each thread run on a different cpu core ?
        num_threads = 2
        workers = []
        #create workers
        for i in range(num_threads):
            workers.append(Worker(i, nS, nA, trainer_critic, trainer_actor))

    #set up session
    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        sess.run(tf.global_variables_initializer())

        #start episodes
        while (i_episode + num_threads) < num_episodes:
            #reset vars
            num_of_load = np.zeros(num_episodes)
            num_of_dump = np.zeros(num_episodes)
            num_of_return = np.zeros(num_episodes)
            state[i_episode] = np.zeros(nS)
            old_state = np.zeros((nTrucks,nS))
            old_time = np.zeros(nTrucks)
            old_action = np.zeros(nTrucks).astype(int)
            Iterations = 0 #number of decision iterations in an episode
            Mean_TD_Error = 0

            #initialize environment threads
            env_threads = []
            for worker in workers:
                run_sim_args = [nTrucks, BucketA_capacity, BucketB_capacity, Truck1_capacity, Truck2_capacity, Truck1_speedRatio, Truck2_speedRatio, worker, old_state, old_time, old_action, Iterations, Mean_TD_Error, i_episode, coord, sess]
                # Print num of episode
                print "\rEpisode: ", i_episode + 1, " / ", num_episodes
                i_episode += 1
                t = threading.Thread(target=run_sim, args=run_sim_args)
                t.start()
                env_threads.append(t)
                #time.sleep(1)
            coord.join(env_threads)

        for i in range(num_episodes):
            if i >= i_episode:
                stats.episode_lengths[i] = Hrs[i_episode-1]
                stats.episode_rewards[i] = ProdRate[i_episode-1]
                stats.episode_loss[i] = Mean_Loss[i_episode-1]
                #print "Mean_Loss[%d] = %r \t eploss[%d] = %r" % (i_episode-1, Mean_Loss[i_episode-1], i, stats.episode_loss[i])
            else:
                stats.episode_lengths[i] = Hrs[i]
                stats.episode_rewards[i] = ProdRate[i]
                stats.episode_loss[i] = Mean_Loss[i]
                #print "Mean_Loss[%d] = %r \t eploss[%d] = %r" % (i, Mean_Loss[i], i, stats.episode_loss[i])
        plotting.plot_episode_stats(stats, name='A3C', smoothing_window=20)
Example #10
def q_learning(env,
               num_episodes,
               discount_factor=0.9,
               alpha=0.8):  #, epsilon=0.1):
    """
    Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy
    while following an epsilon-greedy policy
    
    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.
    
    Returns:
        A tuple (Q, episode_lengths).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    memory = defaultdict(list)

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # The policy we're following
    #policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        #print("Episode ", i_episode)
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes),
                  end="")
            #sys.stdout.flush()

        # Reset the environment and pick the first action
        eigenState = env.reset()

        # One step in the environment
        # total_reward = 0.0
        for t in itertools.count():
            if eigenState in memory:
                memList = memory[eigenState]
                action = memList[0]
                stateValue = memList[1]
                nextState = memList[2]

                if nextState in memory:
                    nextStateValue = memory[nextState][1]
                else:
                    nextStateValue = 0.0
                reward = memList[3]

                Q_program = QuantumProgram()
                qr = Q_program.create_quantum_register("qr", 2)
                cr = Q_program.create_classical_register("cr", 2)
                eigenAction = Q_program.create_circuit("superposition", [qr],
                                                       [cr])
                eigenAction.h(qr)
                eigenAction, qr = groverIteration(Q_program, eigenAction, qr,
                                                  action, reward,
                                                  nextStateValue)

            else:
                #################### Prepare the n-qubit registers #########################################
                Q_program = QuantumProgram()
                qr = Q_program.create_quantum_register("qr", 2)
                cr = Q_program.create_classical_register("cr", 2)
                eigenAction = Q_program.create_circuit("superposition", [qr],
                                                       [cr])
                eigenAction.h(qr)
                ############################################################################################

                stateValue = 0.0

            action = collapseActionSelectionMethod(Q_program, eigenAction, qr,
                                                   cr)
            nextEigenState, reward, done = env.step(action)

            if nextEigenState in memory:
                memList = memory[nextEigenState]
                nextStateValue = memList[1]
            else:
                nextStateValue = 0.0

            #Update state value
            stateValue = stateValue + alpha * (
                reward + (discount_factor * nextStateValue) - stateValue)
            #print(stateValue)

            memory[eigenState] = (action, stateValue, nextEigenState, reward)

            stats.episode_rewards[i_episode] += (discount_factor**t) * reward
            stats.episode_lengths[i_episode] = t

            if done:
                break

            #state = next_state
            eigenState = nextEigenState

    return Q, stats, memory
Example #11
def q_learning(env, num_episodes, discount_factor=1, alpha=0.5, epsilon=0.1):
    """
    Args:
        alpha: TD learning rate
    """
    # height = env.unwrapped.game.height
    width = env.unwrapped.game.width
    Q = defaultdict(lambda: np.zeros(ACTION_SPACE))
    # Q = defaultdict(lambda: np.random.rand(ACTION_SPACE))
    # Q = defaultdict(lambda: np.ones(ACTION_SPACE))
    goal_int = helper.convert_state(16, 1, width)

    for i in range(3):
        Q[goal_int][i] = 0
    
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
        episode_runtime=np.zeros(num_episodes))
    
    stats_test = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
        episode_runtime=np.zeros(num_episodes))

    policy = make_epsilon_greedy_policy(Q, epsilon, ACTION_SPACE)
    for i_episode in range(num_episodes):
        print("------------------------------")
        start_total_runtime = time.time()

        # Reset the env and pick the first action
        previous_state = env.reset()
        state_int = helper.convert_state(previous_state[1], previous_state[0], width)
        for t in range(cf.TIME_RANGE):
            env.render()
            # time.sleep(0.1)
            # Take a step
            action_probs = policy(state_int, i_episode)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            # action = env.action_space.sample()
            if(action == 4):
                import ipdb; ipdb.set_trace()
            # print("---------------------------------")
            # 0: UP
            # 1: DOWN
            # 2: LEFT
            # 3: RIGHT

            next_state, reward, done, _ = env.step(action)
            if done:
                reward = 10
            else:
                reward = reward - 1

            previous_state = next_state

            next_state_int = helper.convert_state(next_state[1], next_state[0], width)

            # Update stats
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # TD Update
            best_next_action = np.argmax(Q[next_state_int])

            td_target = reward + discount_factor*Q[next_state_int][best_next_action]
            td_delta = td_target - Q[state_int][action]

            Q[state_int][action] += alpha * td_delta

            if done:
                # import ipdb; ipdb.set_trace()
                break

            previous_state = next_state
            state_int = next_state_int

        stats.episode_runtime[i_episode] += (time.time()-start_total_runtime)
        run_experiment(env, Q, stats_test, i_episode, width, cf.TIME_RANGE)
    return Q, stats, stats_test
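helper.convert_state(y, x, width) is used here to map a 2-D grid position to a single hashable index for the Q table; the helper itself is not part of the listing, so the following is purely a hypothetical implementation consistent with that usage:

def convert_state(y, x, width):
    # Hypothetical: flatten a (row, column) position into one integer index.
    return int(y * width + x)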
def two_step_tree_backup(env,
                         estimator,
                         num_episodes,
                         discount_factor=1.0,
                         epsilon=0.1,
                         epsilon_decay=1.0):

    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    policy = make_epsilon_greedy_policy(estimator, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):

        state = env.reset()

        #steps within each episode
        for t in itertools.count():

            #pick the first action
            #choose A from S using policy derived from Q (epsilon-greedy)
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)

            #reward and next state based on the action chosen according to the epsilon-greedy policy
            next_state, reward, done, _ = env.step(action)

            #reward by taking action under the policy pi
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                print('Episode is ', i_episode)
                break

            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)),
                                           p=next_action_probs)

            #V = sum_a pi(a, s_{t+1})Q(s_{t+1}, a)
            V = np.sum(next_action_probs * estimator.predict(next_state))

            next_next_state, next_reward, _, _ = env.step(next_action)

            next_next_action_probs = policy(next_next_state)
            next_next_action = np.random.choice(np.arange(
                len(next_next_action_probs)),
                                                p=next_next_action_probs)

            next_V = np.sum(next_next_action_probs *
                            estimator.predict(next_next_state))

            # print "Next Action:", next_action
            # print "Next Action probs :", next_action_probs

            #Main Update Equations for Two Step Tree Backup
            Q_next_state_next_action = estimator.predict(next_state)
            Q_next_state_next_action = Q_next_state_next_action[next_action]

            Delta = next_reward + discount_factor * next_V - Q_next_state_next_action

            # print "Delta :", Delta

            # print "Next Action Prob ", np.max(next_action_probs)

            next_action_selection_probability = np.max(next_action_probs)

            td_target = reward + discount_factor * V + discount_factor * next_action_selection_probability * Delta
            estimator.update(state, action, td_target)
            state = next_state

    return stats
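The estimator passed into this and the earlier function-approximation examples is only required to expose predict(state), returning an array of action values, and update(state, action, td_target); Example #14 below evaluates Q-values as np.dot(theta.T, featurize_state(state)). A minimal linear sketch along those lines (featurize_state, the weight shape, and the learning rate are all assumptions):

import numpy as np

class LinearEstimator:
    """Sketch of a linear Q-value estimator: Q(s, a) = theta[:, a] . featurize_state(s)."""
    def __init__(self, n_features, n_actions, learning_rate=0.01):
        self.theta = np.zeros((n_features, n_actions))
        self.learning_rate = learning_rate

    def predict(self, state):
        # featurize_state is assumed to map a raw state to an n_features vector.
        return self.theta.T.dot(featurize_state(state))

    def update(self, state, action, td_target):
        features = featurize_state(state)
        td_error = td_target - features.dot(self.theta[:, action])
        # semi-gradient step toward the TD target
        self.theta[:, action] += self.learning_rate * td_error * features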
Example #13
def main():
    global num_of_load
    global num_of_dump
    global num_of_return
    global state
    global old_state
    global old_time
    global Mean_TD_Error
    global Iterations
    global nTrucks
    global num_decisions
    global num_decisions_A
    global num_decisions_B
    global local_decisions_A
    global local_decisions_B
    global idle_count_A
    global idle_count_B
    global actions_performed_without_maintenance_A
    global actions_performed_without_maintenance_B
    global repair_downtime_remaining_A
    global repair_downtime_remaining_B
    global both_excavators_failed_flag

    global g_Truck1_capacity
    global g_Truck2_capacity

    BucketA_capacity = 6
    BucketB_capacity = 3
    Truck1_capacity = g_Truck1_capacity
    Truck2_capacity = g_Truck2_capacity
    Truck1_speed = 20
    Truck2_speed = 20
    Truck1_speedRatio = Truck1_speed / float(Truck1_speed + Truck2_speed)
    Truck2_speedRatio = Truck2_speed / float(Truck1_speed + Truck2_speed)

    #run session (initialise tf global vars)
    sess.run(init)

    num_episodes = 10000
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  episode_loss=np.zeros(num_episodes),
                                  episode_decisions_A=np.zeros(num_episodes),
                                  episode_decisions_B=np.zeros(num_episodes),
                                  lastep_decisions_A=[],
                                  lastep_decisions_B=[])

    for i_episode in range(num_episodes):
        #reset global vars
        num_of_load = 0
        num_of_dump = 0
        num_of_return = 0
        state = np.zeros(12)
        old_state = np.zeros((nTrucks, 12))
        old_time = np.zeros(nTrucks)
        Mean_TD_Error = 0
        Iterations = 0
        num_decisions = 0
        num_decisions_A = 0
        num_decisions_B = 0
        idle_count_A = 0
        idle_count_B = 0
        actions_performed_without_maintenance_A = 0
        actions_performed_without_maintenance_B = 0
        repair_downtime_remaining_A = 0
        repair_downtime_remaining_B = 0
        both_excavators_failed_flag = False

        # Print out which episode we're on, useful for debugging.
        print "\rEpisode: ", i_episode + 1, " / ", num_episodes
        #run simulation
        run_sim(nTrucks, BucketA_capacity, BucketB_capacity, Truck1_capacity,
                Truck2_capacity, Truck1_speedRatio, Truck2_speedRatio)
        stats.episode_lengths[i_episode] = Hrs[i_episode]
        stats.episode_rewards[i_episode] = ProdRate[i_episode]
        stats.episode_loss[i_episode] = abs(Mean_TD_Error)
        stats.episode_decisions_A[i_episode] = num_decisions_A
        stats.episode_decisions_B[i_episode] = num_decisions_B

    stats.lastep_decisions_A.extend(local_decisions_A)
    stats.lastep_decisions_B.extend(local_decisions_B)
    # print "local_decisions_A: ", local_decisions_A
    # print "stats.lastep_decisions_A: ", stats.lastep_decisions_A
    # print stats.lastep_decisions_A == local_decisions_A

    #plotting.plot_episode_stats(stats, name='Qlearning_20', smoothing_window=20)
    plotting.plot_episode_stats(stats,
                                name='Qlearning_20_linear',
                                smoothing_window=20)
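
plotting.EpisodeStats is used here with more fields than the usual two-array container; a namedtuple along these lines (a sketch inferred from how `stats` is accessed above) would satisfy those calls:

from collections import namedtuple
import numpy as np

# Sketch of the richer stats container this example assumes; the field
# names come from the attribute accesses above, everything else is assumed.
EpisodeStats = namedtuple("EpisodeStats", [
    "episode_lengths", "episode_rewards", "episode_loss",
    "episode_decisions_A", "episode_decisions_B",
    "lastep_decisions_A", "lastep_decisions_B",
])

num_episodes = 10
stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                     episode_rewards=np.zeros(num_episodes),
                     episode_loss=np.zeros(num_episodes),
                     episode_decisions_A=np.zeros(num_episodes),
                     episode_decisions_B=np.zeros(num_episodes),
                     lastep_decisions_A=[],
                     lastep_decisions_B=[])
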
Example #14
def Q_Sigma_On_Policy_Epsilon_Dependent(env,
                                        theta,
                                        num_episodes,
                                        discount_factor=1.0,
                                        epsilon=0.1,
                                        epsilon_decay=0.999):

    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))
    cumulative_errors = np.zeros(shape=(num_episodes, 1))

    alpha = 0.1
    tau = 1

    for i_episode in range(num_episodes):

        print("Epsisode Number On Policy Q(sigma) Epsilon_Sigma", i_episode)

        epsilon_sigma = epsilon * epsilon_decay**i_episode

        if epsilon_sigma <= 0.0001:
            epsilon_sigma = 0.0001

        #off_policy = behaviour_policy_Boltzmann(theta, tau, env.action_space.n)
        policy = make_epsilon_greedy_policy(theta,
                                            epsilon * epsilon_decay**i_episode,
                                            env.action_space.n)

        state = env.reset()
        next_action = None

        for t in itertools.count():

            if next_action is None:
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)),
                                          p=action_probs)
            else:
                action = next_action

            state_t_1, reward, done, _ = env.step(action)

            if done:
                break

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # q_values = estimator.predict(state)
            # q_values_state_action = q_values[action]
            #evaluate Q(current state, current action)
            features_state = featurize_state(state)
            q_values = np.dot(theta.T, features_state)
            q_values_state_action = q_values[action]

            if np.random.rand() < epsilon_sigma:
                sigma_t_1 = 0
            else:
                sigma_t_1 = 1

            #select next action based on the behaviour policy at next state
            next_action_probs = policy(state_t_1)
            action_t_1 = np.random.choice(np.arange(len(next_action_probs)),
                                          p=next_action_probs)

            # q_values_t_1 = estimator.predict(state_t_1)
            # q_values_next_state_next_action = q_values_t_1[action_t_1]
            features_state_1 = featurize_state(state_t_1)
            q_values_t_1 = np.dot(theta.T, features_state_1)
            q_values_next_state_next_action = q_values_t_1[action_t_1]

            on_policy_next_action_probs = policy(state_t_1)
            on_policy_a_t_1 = np.random.choice(np.arange(
                len(on_policy_next_action_probs)),
                                               p=on_policy_next_action_probs)
            V_t_1 = np.sum(on_policy_next_action_probs * q_values_t_1)

            Delta_t = reward + discount_factor * (
                sigma_t_1 * q_values_next_state_next_action +
                (1 - sigma_t_1) * V_t_1) - q_values_state_action
            """
			target for one step
			1 step TD Target --- G_t(1)
			"""
            td_target = q_values_state_action + Delta_t

            td_error = td_target - q_values_state_action

            # estimator.update(state, action, new_td_target)
            theta[:, action] += alpha * td_error * features_state
            #rms_error = np.sqrt(np.sum((td_error)**2))
            #cumulative_errors[i_episode, :] += rms_error

            state = state_t_1

    return stats  #,cumulative_errors
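
featurize_state, the 400-dimensional theta, and the theta-based make_epsilon_greedy_policy used by these linear function-approximation examples are not shown in the listing. One common construction (an assumption here, following the usual RBF-feature setup for MountainCar with scikit-learn) is:

import numpy as np
import gym
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.kernel_approximation import RBFSampler

env = gym.make("MountainCar-v0")

# Fit a scaler and four banks of 100 RBF kernels on sampled observations,
# giving the 400-dimensional feature vector that
# theta = np.random.normal(size=(400, env.action_space.n)) expects.
observation_examples = np.array(
    [env.observation_space.sample() for _ in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(observation_examples)

featurizer = sklearn.pipeline.FeatureUnion([
    ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
    ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
    ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
    ("rbf4", RBFSampler(gamma=0.5, n_components=100)),
])
featurizer.fit(scaler.transform(observation_examples))


def featurize_state(state):
    """Map a raw observation to its 400-dimensional RBF feature vector."""
    scaled = scaler.transform([state])
    return featurizer.transform(scaled)[0]


def make_epsilon_greedy_policy(theta, epsilon, nA):
    """Epsilon-greedy policy over the linear Q-values q(s, a) = theta[:, a] . phi(s)."""
    def policy_fn(state):
        q_values = np.dot(theta.T, featurize_state(state))
        A = np.ones(nA, dtype=float) * epsilon / nA
        A[np.argmax(q_values)] += (1.0 - epsilon)
        return A
    return policy_fn
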
def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.1, epsilon=0.1):
    """
    Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy
    while following an epsilon-greedy policy
    
    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.
    
    Returns:
        A tuple (Q, episode_lengths).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    
    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.nA))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))    
    
    # The policy we're following
    policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)
    
    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            sys.stdout.flush()
        
        # Reset the environment and pick the first action
        state = env.reset()
        
        # One step in the environment
        # total_reward = 0.0
        for t in itertools.count():
            
            # Take a step
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)
            #print("----------")
            #print(action, "action")
            #print(reward, "reward")
            #print(next_state, "next_state")
            
            
            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            
            # TD Update
            best_next_action = np.argmax(Q[next_state])    
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta
                
            if done:
                break
            
            state = next_state
            #print(state)
    return Q, stats
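
make_epsilon_greedy_policy over a tabular Q is referenced throughout these examples but never shown; the standard construction, consistent with how policy(state) is called above, is (sketch):

import numpy as np

def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Return a function state -> action-probability vector that is
    epsilon-greedy with respect to the given Q table (sketch)."""
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn
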
Example #16
    def q_learning(env,
                   num_episodes,
                   discount_factor=0.9,
                   alpha=0.9,
                   epsilon=0.1):
        """
       Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy
       while following an epsilon-greedy policy

       Args:
           env: OpenAI environment.
           num_episodes: Number of episodes to run for.
           discount_factor: Gamma discount factor.
           alpha: TD learning rate.
           epsilon: Chance to sample a random action. Float between 0 and 1.

       Returns:
           A tuple (Q, episode_lengths).
           Q is the optimal action-value function, a dictionary mapping state -> action values.
           stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
       """

        # The final action-value function.
        # A nested dictionary that maps state -> (action -> action-value).
        step_number = 20
        Q = defaultdict(lambda: np.ones(env.action_space.n))
        exprep = ReplayMemory(1)

        # Keeps track of useful statistics
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes))

        # The policy we're following
        policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

        for i_episode in range(num_episodes):

            print('episode no.', i_episode)

            # Reset the environment and pick the first action
            state = env.reset()

            # One step in the environment
            # total_reward = 0.0
            for t in itertools.count():

                # Take a step
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)),
                                          p=action_probs)
                tup = env.step(action)
                exprep.push(tup)

                # Update statistics
                stats.episode_rewards[i_episode] += tup[3]
                stats.episode_lengths[i_episode] = t
                if exprep.isReady():
                    B = exprep.sampleBatch(1)
                    for j in B:
                        # TD Update

                        print('Q for ', j[0], 'before update', Q[tuple(j[0])])
                        prev_state, action, next_state, reward, done = j[0], j[
                            1], j[2], j[3], j[4]
                        best_next_action = np.argmax(Q[tuple(next_state)])

                        td_target = reward + discount_factor * Q[
                            tuple(next_state
                                  )][best_next_action] if not done else reward

                        td_delta = td_target - Q[tuple(prev_state)][action]

                        Q[tuple(prev_state)][action] += alpha * td_delta

                        print('Q for ', j[0], 'after update', Q[tuple(j[0])])

                    #print('Q',Q[(1,10,1)])
                    #alpha= alpha**t

                    if tup[-1]:  #or t==step_number:
                        if tup[-1]:
                            print('found the solution', tup[2], 'prev', tup[0])
                        # z=input()
                        #print (Q)
                        break

                    state = tup[2]

        return Q, stats
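
The ReplayMemory used above, with push/isReady/sampleBatch, is not part of the listing; a minimal buffer with that interface might look like this (a sketch under that assumption):

import random
from collections import deque

class ReplayMemory:
    """Minimal replay buffer exposing the push/isReady/sampleBatch
    interface assumed in the example above (sketch)."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, transition):
        # transition is whatever tuple the environment's step loop produces.
        self.buffer.append(transition)

    def isReady(self):
        # Ready as soon as the buffer holds at least one full window.
        return len(self.buffer) >= self.buffer.maxlen

    def sampleBatch(self, batch_size):
        return random.sample(self.buffer, batch_size)
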
Example #17
def q_learning(env,
               q_estimator,
               target_estimator,
               num_episodes,
               update_target_estimator_every,
               discount_factor=1.0,
               epsilon_start=1.0,
               epsilon_end=0.1,
               epsilon_decay_steps=500000):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        q_estimator: Estimator object used for the q values.
        target_estimator: Estimator object used for the targets.
        num_episodes: Number of episodes to run for.
        update_target_estimator_every: Copy parameters from the Q estimator
            to the target estimator every N steps.
        discount_factor: Gamma discount factor.
        epsilon_start: Chance to sample a random action when taking an action;
            epsilon is decayed over time and this is the start value.
        epsilon_end: The final minimum value of epsilon after decaying is done.
        epsilon_decay_steps: Number of steps to decay epsilon over.

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    batch_size = 32
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
    epsilon = epsilons[0]
    Transition = namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])
    # Create a replay memory buffer.
    replay_memory = deque(maxlen=10000)
    # Fill replay buffer.
    state = env.reset()
    for i in itertools.count():
        if i % 100 == 0:
            print("Filling replay buffer... " + str(i) + "/" +
                  str(replay_memory.maxlen),
                  end="\r")
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        # Record the transition.
        replay_memory.append(
            Transition(state, action, reward, next_state, done))
        state = next_state
        if done:
            state = env.reset()
        if i >= replay_memory.maxlen:
            break
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    total_t = 0
    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        print("\rEpisode={}/{}\tTotal timesteps={}\tReward={}\tEpsilon={}".
              format(i_episode + 1, num_episodes, total_t, last_reward,
                     epsilon),
              end="")
        sys.stdout.flush()

        # Run an episode.
        state = env.reset()
        for t in itertools.count():
            # Copy target network.
            if total_t % update_target_estimator_every == 0:
                target_estimator.copy_params()
                # print("\nCopied model parameters to target network.")

            # Take action.
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]
            action_probs = np.ones(env.action_space.n,
                                   dtype=float) * epsilon / env.action_space.n
            q_values = q_estimator.predict(state)
            best_action = np.argmax(q_values)
            action_probs[best_action] += (1.0 - epsilon)
            action = np.random.choice(env.action_space.n, p=action_probs)
            next_state, reward, done, info = env.step(action)

            # Record the transition.
            replay_memory.append(
                Transition(state, action, reward, next_state, done))
            # Record stats.
            stats.episode_lengths[i_episode] = t
            stats.episode_rewards[i_episode] += reward

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                np.array, zip(*samples))
            # Calculate q values and targets
            q_values_next = target_estimator.predict_batch(next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(
                np.float32) * discount_factor * np.amax(q_values_next, axis=1)
            # Update Q function.
            states_batch = np.array(states_batch)
            q_estimator.update(states_batch, action_batch, targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        yield i_episode, total_t, stats

    return stats
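
The target computation above relies on np.invert(done_batch) to drop the bootstrap term for terminal transitions; a small self-contained numpy illustration of that masking (the numbers are made up):

import numpy as np

# Toy batch: two transitions, the second one terminal.
reward_batch = np.array([1.0, 0.5])
done_batch = np.array([False, True])
q_values_next = np.array([[0.2, 0.8],    # max = 0.8 (bootstrapped)
                          [0.4, 0.1]])   # max = 0.4 (ignored: terminal)
discount_factor = 0.99

targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
    discount_factor * np.amax(q_values_next, axis=1)
print(targets_batch)   # [1.792  0.5]
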
Example #18
def q_learning(env, num_episodes, discount_factor=1, alpha=0.5, epsilon=0.1, epsilon_decay=1.0):
    """
    Args:
        alpha: TD learning rate
    """
    height = env.unwrapped.game.height
    width = env.unwrapped.game.width
    
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    
    # 4 actions + 2 for X and Y
    weights = np.random.rand(6)

    for i_episode in range(num_episodes):
        print("------------------------------")

        # The policy we're following
        # policy = make_epsilon_greedy_policy(
        #     epsilon * epsilon_decay**i_episode, env.action_space.n)
        
        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        sys.stdout.flush()
        

        # Reset the env and pick the first action
        previous_state = env.reset()

        action_probs = np.ones(4, dtype=float)
        for t in range(TIME_RANGE):
            env.render()
            # time.sleep(0.1)
            # Take a step
            # action_probs = policy(state_int, i_episode)
            normalised_x = int(previous_state[0])/int(width)
            normalised_y = int(previous_state[1])/int(height)

            for i in range(0,4):
                action_probs[i] = weights[i] + normalised_x*weights[4] + normalised_y*weights[5] 
                # action_probs[i] = weights[i] + int(previous_state[0])*weights[4] + int(previous_state[1])*weights[5] 
            
            action = np.argmax(action_probs)
            
            print("action ", action)
            # import ipdb; ipdb.set_trace()
            # action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            # action = env.action_space.sample()

            # 0: UP
            # 1: DOWN
            # 2: LEFT
            # 3: RIGHT

            next_state, reward, done, _ = env.step(action)
            if done:
                reward = 100
            else:
                reward = reward - 1
            
            # Update stats
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # TD Update
            alpha = 0.01
            # v_now = weights[action] + int(previous_state[0])*weights[4] + int(previous_state[1])*weights[5]
            v_now = weights[action] + normalised_x*weights[4] + normalised_y*weights[5]
            
            normalised_next_x = int(next_state[0])/int(width)
            normalised_next_y = int(next_state[1])/int(height)

            # v_next = weights[action] + int(next_state[0])*weights[4] + int(next_state[1])*weights[5]
            v_next = weights[action] + normalised_next_x*weights[4] + normalised_next_y*weights[5]
            weights_delta = alpha*(reward + discount_factor*v_next - v_now)*weights
            print("weights_delta", weights_delta)
            weights = weights - weights_delta
            print("weights", weights)

            previous_state = next_state

            if done:
                break

        # run_experiment(env,state_int, Q, stats_test, i_episode, width, TIME_RANGE)

    return weights, stats
def Q_Sigma_Off_Policy_3_Step(env, theta, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):

	#q-learning algorithm with linear function approximation here

	#estimator : Estimator of Q^w(s,a)	- function approximator
	stats = plotting.EpisodeStats(
		episode_lengths=np.zeros(num_episodes),
		episode_rewards=np.zeros(num_episodes))  

	alpha = 0.01


	for i_episode in range(num_episodes):

		print "Epsisode Number Off Policy Q(sigma) 3 Step", i_episode

		off_policy = behaviour_policy_epsilon_greedy(theta, epsilon * epsilon_decay**i_episode, env.action_space.n)
		policy = make_epsilon_greedy_policy(theta, epsilon * epsilon_decay**i_episode, env.action_space.n)

		state = env.reset()

		next_action = None


		for t in itertools.count():

			if next_action is None:
				action_probs = off_policy(state)
				action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
			else:
				action = next_action

			state_t_1, reward, done, _ = env.step(action)

			stats.episode_rewards[i_episode] += reward
			stats.episode_lengths[i_episode] = t

			if done:
				break			


			# q_values = estimator.predict(state)
			# q_values_state_action = q_values[action]
			#evaluate Q(current state, current action)
			features_state = featurize_state(state)
			q_values = np.dot(theta.T, features_state)
			q_values_state_action = q_values[action]


			#select sigma value
			probability = 0.5
			sigma_t_1 = binomial_sigma(probability)

			#select next action based on the behaviour policy at next state
			next_action_probs = off_policy(state_t_1)
			action_t_1 = np.random.choice(np.arange(len(next_action_probs)), p = next_action_probs)


			# q_values_t_1 = estimator.predict(state_t_1)
			# q_values_next_state_next_action = q_values_t_1[action_t_1]
			features_state_1 = featurize_state(state_t_1)
			q_values_t_1 = np.dot(theta.T, features_state_1)
			q_values_next_state_next_action = q_values_t_1[action_t_1]


			on_policy_next_action_probs = policy(state_t_1)
			on_policy_a_t_1 = np.random.choice(np.arange(len(on_policy_next_action_probs)), p = on_policy_next_action_probs)
			V_t_1 = np.sum( on_policy_next_action_probs * q_values_t_1 )

			Delta_t = reward + discount_factor * ( sigma_t_1 * q_values_next_state_next_action + (1 - sigma_t_1) * V_t_1  ) - q_values_state_action



			state_t_2, next_reward, done, _ = env.step(action_t_1)
			if done:
				break

			next_next_action_probs = off_policy(state_t_2)
			action_t_2 = np.random.choice(np.arange(len(next_next_action_probs)), p = next_next_action_probs)


			# q_values_t_2 = estimator.predict(state_t_2)
			# q_values_next_next_state_next_next_action = q_values_t_2[action_t_2]
			features_state_2 = featurize_state(state_t_2)
			q_values_t_2 = np.dot(theta.T, features_state_2)
			q_values_next_next_state_next_next_action = q_values_t_2[action_t_2]




			on_policy_next_next_action_probs = policy(state_t_2)
			on_policy_a_t_2 = np.random.choice(np.arange(len(on_policy_next_next_action_probs)), p = on_policy_next_next_action_probs)
			V_t_2 = np.sum( on_policy_next_next_action_probs * q_values_t_2  )
			
			sigma_t_2 = binomial_sigma(probability)



			Delta_t_1 = next_reward + discount_factor * (  sigma_t_2 * q_values_next_next_state_next_next_action + (1 - sigma_t_2) * V_t_2   ) - q_values_next_state_next_action


			"""
			3 step TD Target --- G_t(2)
			"""
			state_t_3, next_next_reward, done, _ = env.step(action_t_2)
			if done:
				break
			next_next_next_action_probs = off_policy(state_t_3)
			action_t_3 = np.random.choice(np.arange(len(next_next_next_action_probs)), p = next_next_next_action_probs)

			features_state_3 = featurize_state(state_t_3)
			q_values_t_3 = np.dot(theta.T,features_state_3)
			q_values_next_next_next_state_next_next_next_action = q_values_t_3[action_t_3]

			on_policy_next_next_next_action_probs = policy(state_t_3)
			on_policy_a_t_3 = np.random.choice(np.arange(len(on_policy_next_next_next_action_probs)), p = on_policy_next_next_next_action_probs)
			V_t_3 = np.sum(on_policy_next_next_next_action_probs * q_values_t_3)

			sigma_t_3 = binomial_sigma(probability)

			Delta_t_2 = next_next_reward + discount_factor * (sigma_t_3 * q_values_next_next_next_state_next_next_next_action + (1 - sigma_t_3) * V_t_3 ) -  q_values_next_next_state_next_next_action



			on_policy_action_probability = on_policy_next_action_probs[on_policy_a_t_1]
			off_policy_action_probability = next_action_probs[action_t_1]

			on_policy_next_action_probability = on_policy_next_next_action_probs[on_policy_a_t_2]
			off_policy_next_action_probability = next_next_action_probs[action_t_2]



			td_target = q_values_state_action + Delta_t + discount_factor * ( (1 - sigma_t_1) *  on_policy_action_probability + sigma_t_1 ) * Delta_t_1 + discount_factor * ( (1 - sigma_t_2)  * on_policy_next_action_probability + sigma_t_2 ) * Delta_t_2

			"""
			Computing Importance Sampling Ratio
			"""
			rho = np.divide( on_policy_action_probability, off_policy_action_probability )
			rho_1 = np.divide( on_policy_next_action_probability, off_policy_next_action_probability )

			rho_sigma = sigma_t_1 * rho + 1 - sigma_t_1
			rho_sigma_1 = sigma_t_2 * rho_1 + 1 - sigma_t_2

			all_rho_sigma = rho_sigma * rho_sigma_1

			td_error = td_target -  q_values_state_action 

			# estimator.update(state, action, new_td_target)
			theta[:, action] += alpha * all_rho_sigma * td_error * features_state

			if done:
				break

			state = state_t_1
			
	return stats
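
binomial_sigma and behaviour_policy_epsilon_greedy are helpers assumed by the Q(sigma) examples; plausible minimal versions, consistent with how they are called in this linear-features example and with the featurize_state sketched earlier, are:

import numpy as np

def binomial_sigma(probability):
    """Draw sigma in {0, 1} from a Bernoulli(probability) distribution (sketch)."""
    return np.random.binomial(n=1, p=probability)

def behaviour_policy_epsilon_greedy(theta, epsilon, nA):
    """Epsilon-greedy behaviour policy over the linear Q-values
    q(s, a) = theta[:, a] . phi(s); same construction as
    make_epsilon_greedy_policy, sketched here for completeness."""
    def policy_fn(state):
        # Relies on a featurize_state like the one outlined earlier.
        q_values = np.dot(theta.T, featurize_state(state))
        A = np.ones(nA, dtype=float) * epsilon / nA
        A[np.argmax(q_values)] += (1.0 - epsilon)
        return A
    return policy_fn
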
Example #20
def q_learning(env,
               theta,
               num_episodes,
               discount_factor=1.0,
               alpha=0.1,
               epsilon=0.1,
               epsilon_decay=0.999):

    #q-learning algorithm with linear function approximation here

    #estimator : Estimator of Q^w(s,a)	- function approximator
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    theta = np.random.normal(size=(400, env.action_space.n))

    for i_episode in range(num_episodes):
        print "Episode Number, Q Learning:", i_episode

        #behaviour policy: epsilon-greedy with respect to the current Q estimate;
        #the target policy pi is greedy (argmax over Q) wrt Q(s,a), implemented
        #below via np.argmax(q_values_next)
        policy = make_epsilon_greedy_policy(theta,
                                            epsilon * epsilon_decay**i_episode,
                                            env.action_space.n)

        #should be tau here for the Temperature - if using Boltzmann exploration policy
        # off_policy = behaviour_policy_Boltzmann(theta,epsilon * epsilon_decay**i_episode, env.action_space.n )
        state = env.reset()

        next_action = None

        #for each one step in the environment
        for t in itertools.count():
            if next_action is None:
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)),
                                          p=action_probs)
            else:
                action = next_action

            next_state, reward, done, _ = env.step(action)
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            #Q values for current state
            features_state = featurize_state(state)
            q_values = np.dot(theta.T, features_state)
            q_values_state_action = q_values[action]

            #next action
            #these actions should be based on off policy for Q-learning
            #taking actions according to the off policy epsilon greedy policy
            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)),
                                           p=next_action_probs)

            #next state features and Q(s', a')
            next_features_state = featurize_state(next_state)
            q_values_next = np.dot(theta.T, next_features_state)
            q_values_next_state_next_action = q_values_next[next_action]

            # OR : np.max(q_values_next)
            #this is for the target policy pi
            #which is greedy wrt Q(s,a)
            best_next_action = np.argmax(q_values_next)
            td_target = reward + discount_factor * q_values_next[
                best_next_action]

            td_error = td_target - q_values_state_action

            theta[:, action] += alpha * td_error * features_state

            if done:
                break
            state = next_state
    return stats
Example #21
def deep_q_learning(sess,
                  env,
                  q_estimator,
                  target_estimator,
                  state_processor,
                  num_episodes,
                  experiment_dir,
                  replay_memory_size=500000,
                  replay_memory_init_size=50000,
                  update_target_estimator_every=10000,
                  discount_factor=0.99,
                  epsilon_start=1.0,
                  epsilon_end=0.1,
                  epsilon_decay_steps=500000,
                  batch_size=32,
                  record_video_every=50):
  """
  Q-Learning algorithm for off-policy TD control using Function Approximation.
  Finds the optimal greedy policy while following an epsilon-greedy policy.

  Args:
      sess: Tensorflow Session object
      env: OpenAI environment
      q_estimator: Estimator object used for the q values
      target_estimator: Estimator object used for the targets
      state_processor: A StateProcessor object
      num_episodes: Number of episodes to run for
      experiment_dir: Directory to save Tensorflow summaries in
      replay_memory_size: Size of the replay memory
      replay_memory_init_size: Number of random experiences to sample when initializing
        the replay memory.
      update_target_estimator_every: Copy parameters from the Q estimator to the
        target estimator every N steps
      discount_factor: Gamma discount factor
      epsilon_start: Chance to sample a random action when taking an action.
        Epsilon is decayed over time and this is the start value
      epsilon_end: The final minimum value of epsilon after decaying is done
      epsilon_decay_steps: Number of steps to decay epsilon over
      batch_size: Size of batches to sample from the replay memory
      record_video_every: Record a video every N episodes

  Returns:
      An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
  """

  Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

  # The replay memory
  replay_memory = []

  # Make model copier object
  estimator_copy = ModelParametersCopier(q_estimator, target_estimator)

  # Keeps track of useful statistics
  stats = plotting.EpisodeStats(
      episode_lengths=np.zeros(num_episodes),
      episode_rewards=np.zeros(num_episodes))

  # For 'system/' summaries, useful to check if the current process looks healthy
  current_process = psutil.Process()

  # Create directories for checkpoints and summaries
  checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
  checkpoint_path = os.path.join(checkpoint_dir, "model")
  monitor_path = os.path.join(experiment_dir, "monitor")

  if not os.path.exists(checkpoint_dir):
      os.makedirs(checkpoint_dir)
  if not os.path.exists(monitor_path):
      os.makedirs(monitor_path)

  saver = tf.train.Saver()
  # Load a previous checkpoint if we find one
  latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
  if latest_checkpoint:
      print("Loading model checkpoint {}...\n".format(latest_checkpoint))
      saver.restore(sess, latest_checkpoint)

  # Get the current time step
  # total_t = sess.run(tf.contrib.framework.get_global_step())
  total_t = sess.run(tf.train.get_global_step())

  # The epsilon decay schedule
  epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

  # The policy we're following
  policy = make_epsilon_greedy_policy(
      q_estimator,
      len(VALID_ACTIONS))

  # Populate the replay memory with initial experience
  print("Populating replay memory...")
  state = env.reset()
  state = state_processor.process(sess, state)
  state = np.stack([state] * 4, axis=2)
  for i in range(replay_memory_init_size):
      action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)])
      action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
      next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
      next_state = state_processor.process(sess, next_state)
      next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
      replay_memory.append(Transition(state, action, reward, next_state, done))
      if done:
          state = env.reset()
          state = state_processor.process(sess, state)
          state = np.stack([state] * 4, axis=2)
      else:
          state = next_state


  # Record videos
  # Add env Monitor wrapper
  env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every == 0, resume=True)

  for i_episode in range(num_episodes):

      # Save the current checkpoint
      saver.save(tf.get_default_session(), checkpoint_path)

      # Reset the environment
      state = env.reset()
      state = state_processor.process(sess, state)
      state = np.stack([state] * 4, axis=2)
      loss = None

      # One step in the environment
      for t in itertools.count():

          # Epsilon for this time step
          epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

          # Maybe update the target estimator
          if total_t % update_target_estimator_every == 0:
              estimator_copy.make(sess)
              print("\nCopied model parameters to target network.")

          # Print out which step we're on, useful for debugging.
          print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                  t, total_t, i_episode + 1, num_episodes, loss), end="")
          sys.stdout.flush()

          # Take a step
          action_probs = policy(sess, state, epsilon)
          action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
          next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
          next_state = state_processor.process(sess, next_state)
          next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)

          # If our replay memory is full, pop the first element
          if len(replay_memory) == replay_memory_size:
              replay_memory.pop(0)

          # Save transition to replay memory
          replay_memory.append(Transition(state, action, reward, next_state, done))

          # Update statistics
          stats.episode_rewards[i_episode] += reward
          stats.episode_lengths[i_episode] = t

          # Sample a minibatch from the replay memory
          samples = random.sample(replay_memory, batch_size)
          states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

          # Calculate q values and targets
          q_values_next = target_estimator.predict(sess, next_states_batch)
          targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * discount_factor * np.amax(q_values_next, axis=1)

          # Perform gradient descent update
          states_batch = np.array(states_batch)
          loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

          if done:
              break

          state = next_state
          total_t += 1

      # Add summaries to tensorboard
      episode_summary = tf.Summary()
      episode_summary.value.add(simple_value=epsilon, tag="episode/epsilon")
      episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], tag="episode/reward")
      episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], tag="episode/length")
      episode_summary.value.add(simple_value=current_process.cpu_percent(), tag="system/cpu_usage_percent")
      episode_summary.value.add(simple_value=current_process.memory_percent(memtype="vms"), tag="system/v_memory_usage_percent")
      q_estimator.summary_writer.add_summary(episode_summary, i_episode)
      q_estimator.summary_writer.flush()

      yield total_t, plotting.EpisodeStats(
          episode_lengths=stats.episode_lengths[:i_episode+1],
          episode_rewards=stats.episode_rewards[:i_episode+1])

  return stats
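
ModelParametersCopier is assumed to copy the trainable variables of the Q network into the target network; a sketch of that helper for TF1-style graphs (it assumes each estimator was built in its own variable scope and exposes that scope name as `.scope`):

import tensorflow as tf

class ModelParametersCopier:
    """Copy trainable variables from one estimator's scope to another's.
    Sketch only; the `.scope` attribute on the estimators is an assumption."""

    def __init__(self, estimator1, estimator2):
        e1_params = sorted(
            [t for t in tf.trainable_variables()
             if t.name.startswith(estimator1.scope)],
            key=lambda v: v.name)
        e2_params = sorted(
            [t for t in tf.trainable_variables()
             if t.name.startswith(estimator2.scope)],
            key=lambda v: v.name)
        # One assign op per matching variable pair.
        self.update_ops = [e2_v.assign(e1_v)
                           for e1_v, e2_v in zip(e1_params, e2_params)]

    def make(self, sess):
        sess.run(self.update_ops)
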
Example #22
def sarsa(env,
          estimator,
          num_episodes,
          discount_factor=1.0,
          alpha=0.1,
          epsilon=0.1,
          epsilon_decay=1.0):
    #estimator : Estimator of Q^w(s,a)	- function approximator
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    theta = np.random.normal(size=(400, env.action_space.n))

    for i_episode in range(num_episodes):
        print "Episode Number, SARSA:", i_episode
        #agent policy based on the greedy maximisation of Q

        policy = make_epsilon_greedy_policy(theta,
                                            epsilon * epsilon_decay**i_episode,
                                            env.action_space.n)

        state = env.reset()

        action_probs = policy(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_action = None

        #for each one step in the environment
        for t in itertools.count():

            next_state, reward, done, _ = env.step(action)
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            #Q values for current state
            features_state = featurize_state(state)
            q_values = np.dot(theta.T, features_state)
            q_values_state_action = q_values[action]

            #next action, chosen from the same epsilon-greedy policy (on-policy SARSA)
            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)),
                                           p=next_action_probs)

            #next state features and Q(s', a')
            next_features_state = featurize_state(next_state)
            q_values_next = np.dot(theta.T, next_features_state)
            q_values_next_state_next_action = q_values_next[next_action]

            td_target = reward + discount_factor * q_values_next_state_next_action

            td_error = td_target - q_values_state_action

            theta[:, action] += alpha * td_error * features_state

            if done:
                break

            state = next_state
            action = next_action

    return stats
def three_step_tree_backup(env,
                           num_episodes,
                           discount_factor=1.0,
                           alpha=0.5,
                           epsilon=0.1):

    #Tree Backup: like Expected SARSA, it bootstraps on the expected value
    # over the next actions rather than the maximum, here extended to a
    # three-step return
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        state = env.reset()

        #steps within each episode
        for t in itertools.count():
            #pick the first action
            #choose A from S using policy derived from Q (epsilon-greedy)
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, _, _ = env.step(action)

            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)),
                                           p=next_action_probs)
            next_next_state, next_reward, _, _ = env.step(next_action)

            next_next_action_probs = policy(next_next_state)
            next_next_action = np.random.choice(np.arange(
                len(next_next_action_probs)),
                                                p=next_next_action_probs)
            next_next_next_state, next_next_reward, done, _ = env.step(
                next_next_action)

            next_next_next_action_probs = policy(next_next_next_state)
            next_next_next_action = np.random.choice(
                np.arange(len(next_next_next_action_probs)),
                p=next_next_next_action_probs)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            #updates for the Three Step Tree Backup

            #V = sum_a pi(a, s_{t+1})Q(s_{t+1}, a)
            V = np.sum(next_action_probs * Q[next_state])

            One_Step = reward + discount_factor * V

            next_V = np.sum(next_next_action_probs * Q[next_next_state])
            Delta_1 = next_reward + discount_factor * next_V - Q[next_state][
                next_action]
            next_action_selection_probability = np.max(next_action_probs)

            Two_Step = discount_factor * next_action_selection_probability * Delta_1

            next_next_V = np.sum(next_next_next_action_probs *
                                 Q[next_next_next_state])
            Delta_2 = next_next_reward + discount_factor * next_next_V - Q[
                next_next_state][next_next_action]
            next_next_action_selection_probability = np.max(next_next_action_probs)

            Three_Step = discount_factor * next_action_selection_probability * discount_factor * next_next_action_selection_probability * Delta_2

            td_target = One_Step + Two_Step + Three_Step

            td_delta = td_target - Q[state][action]

            Q[state][action] += alpha * td_delta

            if done:
                break

            state = next_state

    return Q, stats
def q_learning(env,
               num_episodes,
               discount_factor=0.9,
               alpha=0.8):  # , epsilon=0.1):
    """
    Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy
    while following an epsilon-greedy policy
    
    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.
    
    Returns:
        A tuple (Q, episode_lengths).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    memory = defaultdict(tuple)

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # The policy we're following
    # policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in tqdm(range(num_episodes)):
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes),
                  end="")

        # Reset the environment and pick the first action
        eigen_state = env.reset()

        # One step in the environment
        # total_reward = 0.0
        for t in itertools.count():
            if eigen_state in memory:
                mem_list = memory[eigen_state]
                action = mem_list[0]
                state_value = mem_list[1]
                next_state = mem_list[2]

                if next_state in memory:
                    next_state_value = memory[next_state][1]
                else:
                    next_state_value = 0.0
                reward = mem_list[3]

                circuit = QuantumCircuit(2, 2)
                circuit.h(0)
                circuit.h(1)
                circuit = groverIteration(circuit, action, reward,
                                          next_state_value)
            else:
                # Prepare the n-qubit registers
                circuit = QuantumCircuit(2, 2)
                circuit.h(0)
                circuit.h(1)
                state_value = 0.0

            action = collapse_action_select_method(circuit)
            next_eigen_state, reward, done = env.step(action)

            if next_eigen_state in memory:
                mem_list = memory[next_eigen_state]
                next_state_value = mem_list[1]
            else:
                next_state_value = 0.0

            # Update state value
            state_value = state_value + alpha * (
                reward + (discount_factor * next_state_value) - state_value)
            # print(state_value)

            memory[eigen_state] = (action, state_value, next_eigen_state,
                                   reward)

            stats.episode_rewards[i_episode] += (discount_factor**t) * reward
            stats.episode_lengths[i_episode] = t

            if done:
                break

            # state = next_state
            eigen_state = next_eigen_state

    return Q, stats, memory
def qlearning_alpha_e_greedy(env, n_episodes=2000, gamma=0.99, alpha=0.85, best_enabled=False):
    nS = env.observation_space.n
    nA = env.action_space.n

    if best_enabled:
        # record your best-tuned hyperparams here
        env.seed(0)
        np.random.seed(0)
        alpha = 0.05
        gamma = 0.99
        epsilon_decay = 0.95
        e = 1.0

    Q = np.zeros([nS, nA])
    policy = make_decay_e_greedy_policy(Q, nA)

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(n_episodes),
        episode_rewards=np.zeros(n_episodes))

    for i in range(n_episodes):
        # useful for debugging
        log_episode(i, n_episodes)

        s = env.reset()
        done = False
        total_reward = 0

        if best_enabled:
            e *= epsilon_decay
        else:
            e = 1.0 /((i/10) + 1.0)

        for t in itertools.count():
            # Choose action by decaying e-greedy
            probs = policy(s, e)
            a = np.random.choice(np.arange(nA), p=probs)
            # take a step
            next_s, r, done, _ = env.step(a)

            if best_enabled:
                mod_r = modify_reward(r, done)
                td_target = mod_r + gamma * np.max(Q[next_s, :])
            else:
                td_target = r + gamma * np.max(Q[next_s, :])

            td_delta = td_target - Q[s, a]
            Q[s, a] += alpha * td_delta

            s = next_s
            total_reward += r

            if done:
                break

        # Update statistics
        stats.episode_rewards[i] += total_reward
        stats.episode_lengths[i] = t

    return Q, stats
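
make_decay_e_greedy_policy and log_episode are small helpers this example assumes; sketches consistent with how they are called above:

import sys
import numpy as np

def log_episode(i, n_episodes, every=100):
    """Print a lightweight progress line every `every` episodes (sketch)."""
    if (i + 1) % every == 0:
        print("\rEpisode {}/{}.".format(i + 1, n_episodes), end="")
        sys.stdout.flush()

def make_decay_e_greedy_policy(Q, nA):
    """Epsilon-greedy policy whose epsilon is passed in at call time,
    matching the policy(s, e) calls above (sketch)."""
    def policy_fn(state, epsilon):
        A = np.ones(nA, dtype=float) * epsilon / nA
        A[np.argmax(Q[state])] += (1.0 - epsilon)
        return A
    return policy_fn
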
Example #26
                           eps=dropout_prob)
    memory = ReplayMemory(max_size=100000)

    print('Experiment Number ', e)

    loss_per_ep = []
    w1_m_per_ep = []
    w2_m_per_ep = []
    w3_m_per_ep = []
    total_reward = []

    ep = 0
    avg_Rwd = -np.inf
    episode_end_msg = 'loss={:2.10f}, w1_m={:3.1f}, w2_m={:3.1f}, w3_m={:3.1f}, total reward={}'

    stats = plotting.EpisodeStats(episode_lengths=np.zeros(max_n_ep),
                                  episode_rewards=np.zeros(max_n_ep))
    """
    Loop for the number of episodes : For every episode
    """
    while avg_Rwd < min_avg_Rwd and ep < max_n_ep:

        if ep >= n_avg_ep:
            avg_Rwd = np.mean(total_reward[ep - n_avg_ep:ep])
            print("EPISODE {}. Average reward over the last {} episodes: {}.".
                  format(ep, n_avg_ep, avg_Rwd))
        else:
            print("EPISODE {}.".format(ep))
        """
        Contains loop for every step within the episode
        """
        loss_v, w1_m, w2_m, w3_m, cum_R, step_length, variance_steps = run_episode(
def three_step_tree_backup(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):


	Q = defaultdict(lambda : np.zeros(env.action_space.n))
	stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),episode_rewards=np.zeros(num_episodes))  

	for i_episode in range(num_episodes):

		print "Episode Number, Three Step Tree Backup:", i_episode
		#agent policy based on the greedy maximisation of Q
		policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
		last_reward = stats.episode_rewards[i_episode - 1]
		state = env.reset()



		#for each one step in the environment
		for t in itertools.count():

			action_probs = policy(state)
			action = np.random.choice(np.arange(len(action_probs)), p=action_probs)


			next_state, reward, done, _ = env.step(action)
			if done:
				break
			
			stats.episode_rewards[i_episode] += reward
			stats.episode_lengths[i_episode] = t

			next_action_probs = policy(next_state)
			next_action = np.random.choice(np.arange(len(next_action_probs)), p = next_action_probs)
			
			V = np.sum( next_action_probs * Q[next_state])

			Delta = reward + discount_factor * V - Q[state][action]


			next_next_state, next_reward, _, _ = env.step(next_action)
			next_next_action_probs = policy(next_next_state)
			next_next_action = np.random.choice(np.arange(len(next_next_action_probs)), p = next_next_action_probs)

			next_V = np.sum(next_next_action_probs * Q[next_next_state])		
			
			Delta_t_1 = next_reward + discount_factor * next_V - Q[next_state][next_action]

			next_next_next_state, next_next_reward, _, _ = env.step(next_next_action)
			next_next_next_action_probs = policy(next_next_next_state)
			next_next_next_action = np.random.choice(np.arange(len(next_next_next_action_probs)), p = next_next_next_action_probs)

			next_next_V = np.sum(next_next_next_action_probs * Q[next_next_next_state])

			Delta_t_2 = next_next_reward + discount_factor * next_next_V - Q[next_next_state][next_next_action]

			next_action_selection_probability = np.max(next_action_probs)
			next_next_action_selection_probability = np.max(next_next_action_probs)

			td_target = Q[state][action] + Delta + discount_factor * next_action_selection_probability * Delta_t_1 + discount_factor * discount_factor * next_action_selection_probability * next_next_action_selection_probability * Delta_t_2			
			td_delta = td_target - Q[state][action]

			Q[state][action] += alpha * td_delta			
			state = next_state

	return stats
def Q_Sigma_Off_Policy(env, theta, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=0.99):

	#q-learning algorithm with linear function approximation here

	#estimator : Estimator of Q^w(s,a)	- function approximator
	stats = plotting.EpisodeStats(
		episode_lengths=np.zeros(num_episodes),
		episode_rewards=np.zeros(num_episodes)) 
	cumulative_errors = np.zeros(shape=(num_episodes, 1)) 

	alpha = 0.01
	tau=1

  
	for i_episode in range(num_episodes):
		#state_count=np.zeros(shape=(env.observation_space.n,1))

		print ("Epsisode Number Off Policy Q(sigma)", i_episode)

		off_policy = behaviour_policy_Boltzmann(theta, tau, env.action_space.n)
		policy = make_epsilon_greedy_policy(theta, epsilon * epsilon_decay**i_episode, env.action_space.n)

		state = env.reset()
		next_action = None


		for t in itertools.count():

			if next_action is None:
				action_probs = policy(state)
				action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
			else:
				action = next_action

			state_t_1, reward, done, _ = env.step(action)

			if done:
				break			

			stats.episode_rewards[i_episode] += reward
			stats.episode_lengths[i_episode] = t



			# q_values = estimator.predict(state)
			# q_values_state_action = q_values[action]
			#evaluate Q(current state, current action)
			features_state = featurize_state(state)
			q_values = np.dot(theta.T, features_state)
			q_values_state_action = q_values[action]



			#select sigma value
			sigma_t_1=binomial_sigma(0.5)


			#select next action based on the behaviour policy at next state
			next_action_probs = off_policy(state_t_1)
			action_t_1 = np.random.choice(np.arange(len(next_action_probs)), p = next_action_probs)


			# q_values_t_1 = estimator.predict(state_t_1)
			# q_values_next_state_next_action = q_values_t_1[action_t_1]
			features_state_1 = featurize_state(state_t_1)
			q_values_t_1 = np.dot(theta.T, features_state_1)
			q_values_next_state_next_action = q_values_t_1[action_t_1]


			V_t_1 = np.sum( next_action_probs * q_values_t_1 )

			Delta_t = reward + discount_factor * ( sigma_t_1 * q_values_next_state_next_action + (1 - sigma_t_1) * V_t_1  ) - q_values_state_action


			"""
			target for one step
			1 step TD Target --- G_t(1)
			"""
			td_target = q_values_state_action + Delta_t 

			td_error = td_target -  q_values_state_action 

			# estimator.update(state, action, new_td_target)
			theta[:, action] += alpha * td_error * features_state
			rms_error = np.sqrt(np.sum((td_error)**2))
			cumulative_errors[i_episode, :] += rms_error

			state = state_t_1

	return stats,cumulative_errors
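
behaviour_policy_Boltzmann is the softmax behaviour policy used for off-policy Q(sigma) above; a sketch with temperature tau, again relying on the featurize_state outlined earlier:

import numpy as np

def behaviour_policy_Boltzmann(theta, tau, nA):
    """Softmax (Boltzmann) behaviour policy with temperature tau over the
    linear Q-values q(s, a) = theta[:, a] . phi(s) (sketch)."""
    def policy_fn(state):
        q_values = np.dot(theta.T, featurize_state(state))
        preferences = q_values / tau
        preferences -= np.max(preferences)   # numerical stability
        exp_prefs = np.exp(preferences)
        return exp_prefs / np.sum(exp_prefs)
    return policy_fn
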
def Q_Sigma_Off_Policy_2_Step(env, num_episodes, discount_factor=1.0, alpha=0.1, epsilon=0.1):

	Q = defaultdict(lambda : np.zeros(env.action_space.n))
	stats = plotting.EpisodeStats(
		episode_lengths=np.zeros(num_episodes),
		episode_rewards=np.zeros(num_episodes))  


	tau = 1
	tau_decay = 0.999

	sigma = 1
	sigma_decay = 0.995

	for i_episode in range(num_episodes):

		print "Number of Episodes, Q(sigma) Off Policy 2 Step", i_episode

		policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
		off_policy = behaviour_policy_epsilon_greedy(Q, tau, env.action_space.n)

		tau = tau * tau_decay

		if tau < 0.0001:
			tau = 0.0001

		state = env.reset()

		for t in itertools.count():
			action_probs = off_policy(state)
			action = np.random.choice(np.arange(len(action_probs)), p = action_probs)

			state_t_1, reward, done, _ = env.step(action)

			stats.episode_rewards[i_episode] += reward
			stats.episode_lengths[i_episode] = t

			if done:
				sigma = sigma * sigma_decay
				if sigma < 0.0001:
					sigma = 0.0001
				break		

			# probability = 0.5
			# sigma_t_1 = binomial_sigma(probability)

			sigma_t_1 = sigma

			next_action_probs = off_policy(state_t_1)
			action_t_1 = np.random.choice(np.arange(len(next_action_probs)), p = next_action_probs)

			on_policy_next_action_probs = policy(state_t_1)
			on_policy_a_t_1 = np.random.choice(np.arange(len(on_policy_next_action_probs)), p = on_policy_next_action_probs)
			V_t_1 = np.sum( on_policy_next_action_probs * Q[state_t_1] )

			Delta_t = reward + discount_factor * (  sigma_t_1 * Q[state_t_1][action_t_1] + (1 - sigma_t_1) * V_t_1  ) - Q[state][action]


			state_t_2, next_reward, _, _ = env.step(action_t_1)

			next_next_action_probs = off_policy(state_t_2)
			action_t_2 = np.random.choice(np.arange(len(next_next_action_probs)), p = next_next_action_probs)

			on_policy_next_next_action_probs = policy(state_t_2)
			on_policy_a_t_2 = np.random.choice(np.arange(len(on_policy_next_next_action_probs)), p = on_policy_next_next_action_probs)
			V_t_2 = np.sum( on_policy_next_next_action_probs * Q[state_t_2])

			sigma_t_2 = sigma

			Delta_t_1 = next_reward + discount_factor * (  sigma_t_2 * Q[state_t_2][action_t_2] + (1 - sigma_t_2) * V_t_2  ) - Q[state_t_1][action_t_1]


			"""
			2 step TD Target --- G_t(2)
			"""
			
			on_policy_action_probability = on_policy_next_action_probs[on_policy_a_t_1]
			off_policy_action_probability = next_action_probs[action_t_1]

			td_target = Q[state][action] + Delta_t + discount_factor * ( (1 - sigma_t_1) *  on_policy_action_probability + sigma_t_1 ) * Delta_t_1

			"""
			Computing Importance Sampling Ratio
			"""
			rho = np.divide( on_policy_action_probability, off_policy_action_probability )
			rho_sigma = sigma_t_1 * rho + 1 - sigma_t_1

			td_error = td_target - Q[state][action]

			Q[state][action] += alpha * rho_sigma * td_error

			state = state_t_1

	return stats
Example #30
def double_qNetwork(env,
                    n_episodes=3000,
                    gamma=0.99,
                    alpha=0.85,
                    best_enabled=False,
                    log_by_step=False,
                    result_per_episode=100,
                    network_type='LR'):
    nS = env.observation_space.n
    nA = env.action_space.n
    hidden_size = 30
    subRewardList = []
    avgRewardList = []

    def one_hot(x):
        return np.identity(nS)[x:x + 1]

    if best_enabled:
        # record your best-tuned hyperparams here
        env.seed(0)
        np.random.seed(0)
        alpha = 0.003
        gamma = 0.99
        epsilon_decay = 0.95
        e = 1.0

    X = tf.placeholder(shape=[1, nS], dtype=tf.float32)
    Y = tf.placeholder(shape=[1, nA], dtype=tf.float32)

    if network_type == 'NN':
        W1_1 = tf.get_variable(
            "W1_1",
            shape=[nS, hidden_size],
            initializer=tf.contrib.layers.xavier_initializer())
        Z1_1 = tf.matmul(X, W1_1)
        Z1_1 = tf.nn.tanh(Z1_1)
        W2_1 = tf.get_variable(
            "W2_1",
            shape=[hidden_size, nA],
            initializer=tf.contrib.layers.xavier_initializer())
        Qpred_1 = tf.matmul(Z1_1, W2_1)

        W1_2 = tf.get_variable(
            "W1_2",
            shape=[nS, hidden_size],
            initializer=tf.contrib.layers.xavier_initializer())
        Z1_2 = tf.matmul(X, W1_2)
        Z1_2 = tf.nn.tanh(Z1_2)
        W2_2 = tf.get_variable(
            "W2_2",
            shape=[hidden_size, nA],
            initializer=tf.contrib.layers.xavier_initializer())
        Qpred_2 = tf.matmul(Z1_2, W2_2)

    else:  # network_type == 'LR':  (Logistic Regression)
        W_1 = tf.Variable(tf.random_uniform([nS, nA], 0, 0.01))
        W_2 = tf.Variable(tf.random_uniform([nS, nA], 0, 0.01))
        Qpred_1 = tf.matmul(X, W_1)
        Qpred_2 = tf.matmul(X, W_2)

    loss_1 = tf.reduce_sum(tf.square(Y - Qpred_1))
    train_1 = tf.train.GradientDescentOptimizer(
        learning_rate=alpha).minimize(loss_1)
    loss_2 = tf.reduce_sum(tf.square(Y - Qpred_2))
    train_2 = tf.train.GradientDescentOptimizer(
        learning_rate=alpha).minimize(loss_2)

    init = tf.global_variables_initializer()

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(n_episodes),
                                  episode_rewards=np.zeros(n_episodes))

    with tf.Session() as sess:
        sess.run(init)

        for i in range(n_episodes):
            s = env.reset()
            total_reward = 0
            e = 1. / ((i / 10) + 10)  # decay epsilon as training progresses

            done = False
            count = 0

            Qpred_update = None
            Qpred_other = None
            train = None

            for t in itertools.count():

                # With 0.5 probability
                if np.random.choice(2) == 1:
                    Qpred_update = Qpred_1
                    Qpred_other = Qpred_2
                    train = train_1
                else:
                    Qpred_update = Qpred_2
                    Qpred_other = Qpred_1
                    train = train_2

                Qs = sess.run(Qpred_update, feed_dict={X: one_hot(s)})

                if log_by_step:
                    print(Qs)

                if np.random.rand(1) < e:
                    a = env.action_space.sample()
                else:
                    a = np.argmax(Qs)

                s1, reward, done, _ = env.step(a)

                if log_by_step:
                    print(
                        'step %d, curr state : %d, action : %d, next state : %d, reward : %d'
                        % (t, s, a, s1, reward))

                if best_enabled:
                    mod_reward = modify_reward(reward, done)

                    if done:
                        Qs[0, a] = mod_reward
                    else:
                        Qs1 = sess.run(Qpred_other, feed_dict={X: one_hot(s1)})
                        Qs[0, a] = mod_reward + gamma * np.max(Qs1)
                else:
                    if done:
                        Qs[0, a] = reward
                    else:
                        Qs1 = sess.run(Qpred_other, feed_dict={X: one_hot(s1)})
                        Qs[0, a] = reward + gamma * np.max(Qs1)

                sess.run(train, feed_dict={X: one_hot(s), Y: Qs})

                total_reward += reward
                s = s1
                count += 1

                if done:
                    break

            subRewardList.append(total_reward)

            if (i + 1) % result_per_episode == 0:
                avg = sum(subRewardList) / result_per_episode
                avgRewardList.append(avg)
                print(i + 1, ' episode =', total_reward, ', avg =', avg)
                subRewardList = []

            # Update statistics
            stats.episode_rewards[i] += total_reward
            stats.episode_lengths[i] = t

    return stats, subRewardList, avgRewardList
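
A hypothetical way to invoke double_qNetwork, assuming FrozenLake-v0 under the classic four-value gym step API (the environment name and hyperparameters are illustrative, not from the original):

import gym

# Hypothetical usage; double_qNetwork is the function defined above.
env = gym.make("FrozenLake-v0")
stats, sub_rewards, avg_rewards = double_qNetwork(env,
                                                  n_episodes=2000,
                                                  network_type='LR',
                                                  result_per_episode=100)
if avg_rewards:
    print("average reward over the last window:", avg_rewards[-1])
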