Example #1
    def run_iterations(worker_num=0):
        print(f'#{worker_num}: Running {EPOCHS} iterations in worker')
        for i in range(EPOCHS):
            Q_copy = Q.copy()  # do not lock, just evaluate on a recent copy
            if i % period == 0:
                print(f'#{worker_num}: Evaluating agent on {i} iteration...')
                fitness = evaluate_q(env, Q_copy)
                print(f'#{worker_num}: Current fitness: {fitness:.2f}')
                data.append(fitness)

            # reset env for each epoch
            agent = PolicyBasedTrader(policy=None, env=env)
            s = 0  # starting state
            print(f'#{worker_num}: Rollout for epoch {i}')
            while s is not None:  # rollout
                # do not allow other threads to update Q within a single step
                with lock:
                    a = get_next_action(agent, Q, s, eps)

                    r, s_ = agent.take_action(a, s)
                    # maximize Q for the next state
                    max_q = maximize_q(agent, Q, s_)

                    Q[s, a] += alpha * (r + gamma * max_q - Q[s, a])

                s = s_
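
The worker above relies on a maximize_q helper that is not listed. A minimal sketch of what it is assumed to do (value of the greedy action in the next state, with 0.0 for the terminal state; the real implementation may additionally restrict the choice to actions valid for the agent):

import numpy as np

def maximize_q(agent, Q, s_):
    # Assumed helper: best Q-value achievable from the next state s_.
    # The terminal state is represented as None and contributes nothing.
    if s_ is None:
        return 0.0
    return float(np.max(Q[s_]))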
Example #2
    def run_iterations():
        nonlocal eps, best_fitness
        print(f'Running {EPOCHS} epochs\n')
        for i in range(EPOCHS):
            if i % period == 0:
                print(f'Evaluating agent on {i} iteration...')
                fitness = evaluate_q(env, Q)
                if fitness > 0:
                    click.secho(f'We have positive fitness {fitness:.2f}',
                                fg='red')
                if fitness > best_fitness:
                    best_fitness = fitness
                data.append(fitness)

            # reset env for each epoch
            agent = PolicyBasedTrader(policy=None, env=env)
            s = 0  # starting state
            a = get_next_action(agent, Q, s, eps=eps)
            print(f'Rollout for epoch {i}')
            while s is not None:  # rollout
                r, s_ = agent.take_action(a, s)
                if s_ is not None:
                    a_ = get_next_action(agent, Q, s_, eps=eps)
                    q_update = alpha * (r + gamma * Q[s_, a_] - Q[s, a])
                else:
                    q_update = alpha * (r - Q[s, a])
                    a_ = None

                Q[s, a] += q_update

                s = s_
                a = a_
            eps = min_eps + (max_eps - min_eps) * np.exp(-decay_rate * i)
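
Both SARSA loops call get_next_action, which is not listed here. A plausible epsilon-greedy sketch (the actions_space_size attribute is taken from the other examples; any masking of invalid actions is omitted):

import numpy as np

def get_next_action(agent, Q, s, eps):
    # Assumed epsilon-greedy selection: explore with probability eps,
    # otherwise act greedily with respect to the current Q-table.
    if np.random.random() < eps:
        return np.random.randint(agent.actions_space_size)
    return int(np.argmax(Q[s]))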
Example #3
def evaluate_q(env, Q):
    agent = PolicyBasedTrader(policy=None, env=env)
    policy = extract_policy(agent, Q)

    for step in range(env.size):
        state = agent.to_state(step, agent.amount_usd)
        action = policy[state]
        agent.take_action(action, state)

    return agent.profit
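
evaluate_q depends on extract_policy. For the Q-table case used here, a greedy sketch could look as follows (note that other examples call the same helper with a state-value function, so the real implementation is likely more general):

import numpy as np

def extract_policy(agent, Q):
    # Assumed greedy extraction for a Q-table: for every state,
    # pick the action with the highest estimated value.
    return np.argmax(Q, axis=1)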
Example #4
def policy_iteration():
    env = Environment()
    env.load(2018)
    # env.load_demo()
    agent = PolicyBasedTrader(policy=None, env=env)
    s_S = agent.states_space_size
    s_A = agent.actions_space_size

    v = load_model()
    v = v if v is not None else np.zeros(s_S)  # value function
    p = np.full(s_S, IDLE_ACTION_INDEX)  # initial policy should be valid
    gamma = 1  # discount factor

    EPOCHS = 1000
    period = 5
    data = []

    print(f'States space size is {s_S}')
    print(f'Actions space size is {s_A}')
    print(f'Max epochs to run {EPOCHS}')

    theta = 0.05  # convergence check
    t1 = timeit.default_timer()
    for i in range(EPOCHS):
        t2 = timeit.default_timer()
        dt = format_timespan(t2 - t1)
        sys.stdout.write(f'\rIteration {i}/{EPOCHS}... {dt} passed')
        sys.stdout.flush()
        while True:  # policy evaluation
            delta = 0
            for s in range(s_S):
                v_ = v[s]
                v[s] = get_new_value_for_state(agent, s, v, p, gamma=gamma)
                delta = max(delta, np.abs(v_ - v[s]))

            # print(delta)
            if delta < theta:
                break

        policy_stable = True
        # policy improvement
        for s in range(s_S):
            action = p[s]
            p[s] = get_max_action_for_state(agent, s, v, gamma=gamma)
            if action != p[s]:
                policy_stable = False

        if i % period == 0:
            save_model(v)

        if policy_stable:
            print(f'\nFound stable policy on iteration {i}!')
            print(
                '\nSaving resulting model to a file {}'.format(MODEL_FILENAME))
            save_model(v)
            break

    print_value_function(v)

    return p
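
policy_iteration delegates the Bellman backups to get_new_value_for_state and get_max_action_for_state, neither of which is shown. A minimal sketch, assuming the environment is deterministic and the agent exposes a non-mutating one-step lookahead (preview_action is a hypothetical name, not part of the original code):

import numpy as np

def get_new_value_for_state(agent, s, v, p, gamma=1.0):
    # Assumed backup under the current policy p: v(s) <- r + gamma * v(s').
    r, s_ = agent.preview_action(p[s], s)  # hypothetical non-mutating step
    return r if s_ is None else r + gamma * v[s_]

def get_max_action_for_state(agent, s, v, gamma=1.0):
    # Assumed greedy improvement: argmax over actions of the same backup.
    outcomes = []
    for a in range(agent.actions_space_size):
        r, s_ = agent.preview_action(a, s)  # hypothetical non-mutating step
        outcomes.append(r if s_ is None else r + gamma * v[s_])
    return int(np.argmax(outcomes))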
Example #5
def main():
    v = load_model()
    env = Environment()
    env.load(2018)
    # env.load_demo()
    agent = PolicyBasedTrader(policy=None, env=env, verbose=False)
    print(f'Total states: {agent.states_space_size}')
    # policy = extract_policy(agent, v)
    with open(POLICY_FILENAME, 'rb') as f:
        policy = np.load(f)

    print('Count actions')
    c = Counter()
    agent = PolicyBasedTrader(policy=None, env=env, verbose=True)

    min_amount_uah = 0
    for step in range(env.size):
        state = agent.to_state(step, agent.amount_usd)
        action = policy[state]
        c[action] += 1
        agent.take_action(action, state)
        min_amount_uah = min(agent.amount_uah, min_amount_uah)

    for i, action in enumerate(ACTIONS):
        print(action, '->', c.get(i))

    print('min amount uah', min_amount_uah)
    print(c)
Example #6
def evaluate_policy(policy):
    env = Environment()
    env.load(2018)
    # env.load_demo()
    agent = PolicyBasedTrader(policy=None, env=env, verbose=True)

    for step in range(env.size):
        state = agent.to_state(step, agent.amount_usd)
        action = policy[state]
        agent.take_action(action, state)

    print('End amount UAH: {:.2f}'.format(agent.amount_uah))
    print('End amount USD: {:.2f}'.format(agent.amount_usd))
    print('Profit in UAH: {:.2f}'.format(agent.profit))
    exit_uah = agent.amount_usd * env.get_observation(env.size - 1).rate_buy
    exit_amount = agent.amount_uah + exit_uah
    print('Amount on exit now: {:.2f}'.format(exit_amount))
    return agent.profit
Example #7
def value_iteration(plot_chart=False):
    env = Environment()
    env.load(2018)
    # env.load_demo()
    agent = PolicyBasedTrader(policy=None, env=env)
    s_S = agent.states_space_size
    s_A = agent.actions_space_size

    v = load_model()
    v = v if v is not None else np.zeros(s_S)
    gamma = 1  # undiscounted return for the whole episode

    EPOCHS = 20
    period = 5
    data = []

    print(f'States space size is {s_S}')
    print(f'Actions space size is {s_A}')
    print(f'Max epochs to run {EPOCHS}')

    theta = 0.05  # convergence check
    t1 = timeit.default_timer()
    for i in range(EPOCHS):
        delta = 0
        t2 = timeit.default_timer()
        dt = format_timespan(t2 - t1)
        sys.stdout.write(f'\rIteration {i}/{EPOCHS}... {dt} passed')
        sys.stdout.flush()
        for s in range(s_S):
            v_ = v[s]
            actions_outcomes = get_outcomes_for_state(agent, s, v, gamma=gamma)
            v[s] = max(actions_outcomes)
            delta = max(delta, np.abs(v_ - v[s]))

        if i % period == 0:
            save_model(v)

        if delta < theta:
            print(f'\nValue function converged in {i} iterations')
            print(
                '\nSaving resulting model to a file {}'.format(MODEL_FILENAME))
            save_model(v)
            break

    print_value_function(v)

    print('=' * 80)
    print('Extracting deterministic policy, pi')
    policy = extract_policy(agent, v)
    print(policy)

    return policy
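
value_iteration sets v[s] to the maximum of get_outcomes_for_state, which is not listed. Under the same deterministic assumption (and the hypothetical preview_action from the sketch after Example #4), it could be:

def get_outcomes_for_state(agent, s, v, gamma=1.0):
    # Assumed: backed-up value r + gamma * v(s') for every available action;
    # a terminal next state contributes only the immediate reward.
    outcomes = []
    for a in range(agent.actions_space_size):
        r, s_ = agent.preview_action(a, s)  # hypothetical non-mutating step
        outcomes.append(r if s_ is None else r + gamma * v[s_])
    return outcomes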
Example #8
    def run_iterations(worker_num=0):
        # print(f'#{worker_num}: Running {EPOCHS} iterations in worker')
        nonlocal eps
        progress = tqdm.tqdm(
            desc='#{:02d}'.format(worker_num),
            position=worker_num,
            total=EPOCHS,
            leave=False,
        )
        for i in range(EPOCHS):
            Q_copy = Q.copy()  # do not lock, just evaluate on a recent copy
            if i % period == 0:
                # print(f'#{worker_num}: Evaluating agent on {i} iteration...')
                fitness = evaluate_q(env, Q_copy)
                data.append(fitness)

            # reset env for each epoch
            agent = PolicyBasedTrader(policy=None, env=env)
            s = 0  # starting state
            a = get_next_action(agent, Q_copy, s, eps=eps)
            # print(f'#{worker_num}: Rollout for epoch {i}')
            while s is not None:  # rollout
                r, s_ = agent.take_action(a, s)
                with lock:
                    if s_ is not None:
                        a_ = get_next_action(agent, Q, s_, eps=eps)
                        q_update = alpha * (r + gamma*Q[s_, a_] - Q[s, a])
                    else:
                        q_update = alpha * (r - Q[s, a])
                        a_ = None

                    Q[s, a] += q_update

                s = s_
                a = a_
            eps = min_eps + (max_eps - min_eps)*np.exp(-decay_rate*i)
            progress.update()
        progress.close()
        return worker_num
Example #9
def evaluate_agent():
    env = Environment()
    env.load(2018)
    agent = PolicyBasedTrader(policy=None, env=env)
    model = load_model()
    if model is None:
        raise RuntimeError('Train agent first, no model to load')

    policy = extract_policy(agent, model)

    for step in range(env.size):
        state = agent.to_state(step, agent.amount_usd)
        action = policy[state]
        agent.take_action(action, state)

    print('End amount UAH: {:.2f}'.format(agent.amount_uah))
    print('End amount USD: {:.2f}'.format(agent.amount_usd))
    print('Profit in UAH: {:.2f}'.format(agent.profit))
    exit_uah = agent.amount_usd * env.get_observation(env.size - 1).rate_buy
    exit_amount = agent.amount_uah + exit_uah
    print('Amount on exit now: {:.2f}'.format(exit_amount))
    return agent.profit
Example #10
def sarsa(play=False, plot_chart=False):
    env = Environment()
    # load smaller environment for just one month
    env.load(2018)
    agent = PolicyBasedTrader(policy=None, env=env)
    s_S = agent.states_space_size
    s_A = agent.actions_space_size
    print(f'States space size is {s_S}')
    print(f'Actions space size is {s_A}')
    print(f'Number of steps in environment is {env.size}')

    alpha = 1  # learning rate, discard old results immediately
    gamma = 1  # discount factor
    # load model from a file if saved previously
    model = SarsaModel.load() if play else None
    Q = model.Q if model is not None else np.zeros(shape=(s_S, s_A))
    if model is not None:
        print(f'Resuming with eps={model.eps}')

    min_eps = 0.01
    # eps = 0.1  # start with exploration
    eps = model.eps if model is not None else 0.1
    max_eps = 1.0
    decay_rate = 0.01

    EPOCHS = 100
    period = 5
    data = []

    print(f'Running {EPOCHS} epochs\n')
    lock = Lock()

    def run_iterations(worker_num=0):
        # print(f'#{worker_num}: Running {EPOCHS} iterations in worker')
        nonlocal eps
        progress = tqdm.tqdm(
            desc='#{:02d}'.format(worker_num),
            position=worker_num,
            total=EPOCHS,
            leave=False,
        )
        for i in range(EPOCHS):
            Q_copy = Q.copy()  # do not lock, just evaluate on a recent copy
            if i % period == 0:
                # print(f'#{worker_num}: Evaluating agent on {i} iteration...')
                fitness = evaluate_q(env, Q_copy)
                data.append(fitness)

            # reset env for each epoch
            agent = PolicyBasedTrader(policy=None, env=env)
            s = 0  # starting state
            a = get_next_action(agent, Q_copy, s, eps=eps)
            # print(f'#{worker_num}: Rollout for epoch {i}')
            while s is not None:  # rollout
                r, s_ = agent.take_action(a, s)
                with lock:
                    if s_ is not None:
                        a_ = get_next_action(agent, Q, s_, eps=eps)
                        q_update = alpha * (r + gamma*Q[s_, a_] - Q[s, a])
                    else:
                        q_update = alpha * (r - Q[s, a])
                        a_ = None

                    Q[s, a] += q_update

                s = s_
                a = a_
            eps = min_eps + (max_eps - min_eps)*np.exp(-decay_rate*i)
            progress.update()
        progress.close()
        return worker_num

    fig, ax = plt.subplots(figsize=(6, 4))
    fig.canvas.set_window_title('Agent evaluation')

    def build_live_chart(i):
        local_data = data[::]
        datax = np.arange(0, period*len(local_data), period)

        plt.xlabel('Iterations')
        plt.ylabel('Fitness')
        plt.title('Learning curve')
        ax.clear()
        ax.plot(datax, local_data, 'b', label='Score')
        ax.legend()
        ax.grid(True)

    workers = 1
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [
            executor.submit(run_iterations, i)
            for i in range(workers)
        ]
        ani = animation.FuncAnimation(fig, build_live_chart, interval=500)
        plt.show()
        result = concurrent.futures.wait(futures)
        plt.close()
        assert len(result.done) == workers

    # Save latest data
    if not play:
        model = SarsaModel(Q=Q, eps=eps)
        model.save()
    print('\nDone!')
    policy = extract_policy(agent, Q)

    if plot_chart:
        build_evaluation_chart(data, period=period)

    return policy
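
A typical way to train the SARSA agent and then inspect the extracted policy, assuming sarsa and evaluate_policy (Example #6) live in the same module:

if __name__ == '__main__':
    policy = sarsa(play=False, plot_chart=True)
    evaluate_policy(policy)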
Example #11
def q_learning(plot_chart=False):
    env = Environment()
    env.load(2018)
    agent = PolicyBasedTrader(policy=None, env=env)
    s_S = agent.states_space_size
    s_A = agent.actions_space_size
    print(f'States space size is {s_S}')
    print(f'Actions space size is {s_A}')

    alpha = 0.2  # learning rate
    gamma = 1  # discount factor
    eps = 0.8  # exploration factor, higher - more exploration
    model = load_model()
    Q = model if model is not None else np.zeros(shape=(s_S, s_A))

    EPOCHS = 500
    period = 5
    data = []

    lock = Lock()

    def run_iterations(worker_num=0):
        print(f'#{worker_num}: Running {EPOCHS} iterations in worker')
        for i in range(EPOCHS):
            Q_copy = Q.copy()  # do not lock, just evaluate on a recent copy
            if i % period == 0:
                print(f'#{worker_num}: Evaluating agent on {i} iteration...')
                fitness = evaluate_q(env, Q_copy)
                print(f'#{worker_num}: Current fitness: {fitness:.2f}')
                data.append(fitness)

            # reset env for each epoch
            agent = PolicyBasedTrader(policy=None, env=env)
            s = 0  # starting state
            print(f'#{worker_num}: Rollout for epoch {i}')
            while s is not None:  # rollout
                # do not allow other threads to update Q within a single step
                with lock:
                    a = get_next_action(agent, Q, s, eps)

                    r, s_ = agent.take_action(a, s)
                    # maximize Q for the next state
                    max_q = maximize_q(agent, Q, s_)

                    Q[s, a] += alpha * (r + gamma * max_q - Q[s, a])

                s = s_

    workers = 8
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {
            executor.submit(run_iterations, i): i
            for i in range(workers)
        }
        for future in concurrent.futures.as_completed(futures):
            worker_num = futures[future]
            try:
                r = future.result()
                print(f'#{worker_num}: Finished!')
            except Exception as e:
                print(f'#{worker_num}: Failed with {e}')

    save_model(Q)
    policy = extract_policy(agent, Q)
    if plot_chart:
        build_evaluation_chart(data, period=period)

    return policy
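
Several examples (#4, #7, #9, #11) use load_model, save_model and a MODEL_FILENAME constant that are not shown. A minimal sketch, assuming the model is a single numpy array (a Q-table or a value function) stored in numpy's binary format; the filename below is a placeholder:

import os
import numpy as np

MODEL_FILENAME = 'model.npy'  # placeholder, the real constant is defined elsewhere

def save_model(model):
    # Assumed persistence helper: dump the array as-is.
    with open(MODEL_FILENAME, 'wb') as f:
        np.save(f, model)

def load_model():
    # Assumed counterpart: return None when no model has been saved yet.
    if not os.path.exists(MODEL_FILENAME):
        return None
    with open(MODEL_FILENAME, 'rb') as f:
        return np.load(f)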
Example #12
def sarsa(play=False, plot_chart=False):
    env = Environment()
    # load smaller environment for just one month
    env.load(2018)
    agent = PolicyBasedTrader(policy=None, env=env)
    s_S = agent.states_space_size
    s_A = agent.actions_space_size
    print(f'States space size is {s_S}')
    print(f'Actions space size is {s_A}')
    print(f'Number of steps in environment is {env.size}')

    alpha = 1  # learning rate, discard old results immediately
    gamma = 1  # discount factor
    # load model from a file if saved previously
    model = SarsaModel.load() if play else None
    Q = model.Q if model is not None else np.zeros(shape=(s_S, s_A))
    if model is not None:
        print(f'Resuming with eps={model.eps}')

    min_eps = 0.01
    # start with exploration unless resuming from a saved model
    eps = model.eps if model is not None else 0.1
    max_eps = 1.0
    decay_rate = 0.05
    best_fitness = -np.inf

    EPOCHS = 2000
    period = 5
    data = []

    fig, ax = plt.subplots(figsize=(6, 4))
    fig.canvas.set_window_title('Agent evaluation')

    def build_live_chart(i):
        window = 20  # show N last values
        local_data = data[-window:]
        sv = (len(data) - window) * period if len(data) - window > 0 else 0
        ev = len(data) * period
        datax = np.arange(sv, ev, period)

        plt.xlabel('Iterations')
        plt.ylabel('Fitness')
        plt.title('Learning curve')
        ax.clear()
        ax.plot(datax, local_data, 'b', label='Score')
        ax.legend()
        ax.grid(True)

    def run_iterations():
        nonlocal eps, best_fitness
        print(f'Running {EPOCHS} epochs\n')
        for i in range(EPOCHS):
            if i % period == 0:
                print(f'Evaluating agent on {i} iteration...')
                fitness = evaluate_q(env, Q)
                if fitness > 0:
                    click.secho(f'We have positive fitness {fitness:.2f}',
                                fg='red')
                if fitness > best_fitness:
                    best_fitness = fitness
                data.append(fitness)

            # reset env for each epoch
            agent = PolicyBasedTrader(policy=None, env=env)
            s = 0  # starting state
            a = get_next_action(agent, Q, s, eps=eps)
            print(f'Rollout for epoch {i}')
            while s is not None:  # rollout
                r, s_ = agent.take_action(a, s)
                if s_ is not None:
                    a_ = get_next_action(agent, Q, s_, eps=eps)
                    q_update = alpha * (r + gamma * Q[s_, a_] - Q[s, a])
                else:
                    q_update = alpha * (r - Q[s, a])
                    a_ = None

                Q[s, a] += q_update

                s = s_
                a = a_
            eps = min_eps + (max_eps - min_eps) * np.exp(-decay_rate * i)

    ani = animation.FuncAnimation(fig, build_live_chart, interval=500)
    t = threading.Thread(target=run_iterations)
    t.start()
    plt.show()
    t.join()

    # Save latest data
    if not play:
        model = SarsaModel(Q=Q, eps=eps)
        model.save()
    print('\nDone!')
    click.secho(f'Best fitness {best_fitness:.2f}', fg='green')
    policy = extract_policy(agent, Q)

    if plot_chart:
        build_evaluation_chart(data, period=period)

    return policy
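
Examples #10 and #12 persist training state through a SarsaModel class that is not listed. A minimal sketch, assuming it simply bundles the Q-table with the current exploration rate and pickles itself to a fixed file (the filename is a placeholder):

import os
import pickle

class SarsaModel:
    FILENAME = 'sarsa_model.pkl'  # placeholder name

    def __init__(self, Q, eps):
        self.Q = Q
        self.eps = eps

    def save(self):
        # Assumed: serialize the whole container so training can be resumed.
        with open(self.FILENAME, 'wb') as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls):
        # Assumed: return None when there is nothing to resume from.
        if not os.path.exists(cls.FILENAME):
            return None
        with open(cls.FILENAME, 'rb') as f:
            return pickle.load(f)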