Exemplo n.º 1
0
def curiosity(world):
    world = ActionNoise(world, stddev=0.2)
    memory = Cache(max_size=100)

    log_dir = "__oracle"
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    agent = build_agent()
    agent_opt = Adams(np.random.randn(agent.n_params), lr=0.00015, memory=0.5)

    oracle = build_oracle()
    oracle_opt = Adam(np.random.randn(oracle.n_params) * 0.1,
                      lr=0.05,
                      memory=0.95)

    for episode in range(1000):
        agent.load_params(agent_opt.get_value())
        oracle.load_params(oracle_opt.get_value())

        agent_trajs = world.trajectories(agent, 4)
        for_oracle = [[(np.asarray([o1, o2, o3]).flatten(), a1, r1)
                       for (o1, a1, r1), (o2, a2,
                                          r2), (o3, a3,
                                                r3) in zip(t, t[1:], t[2:])]
                      for t in agent_trajs]
        memory.add_trajectories(for_oracle)

        predictions = retrace(for_oracle, model=oracle)
        save_plot(log_dir + "/%04d.png" % (episode + 1), agent_trajs,
                  predictions)
        np.save(log_dir + "/%04d.npy" % (episode + 1), agent_opt.get_value())

        curiosity_trajs = [[
            (o1, a1, np.log(np.mean(np.square((o2 - o1) - delta_p))))
            for (o1, a1, r1), (o2, a2, r2), delta_p in zip(t, t[10:], p)
        ] for t, p in zip(agent_trajs, predictions)]
        #curiosity_trajs = replace_rewards(curiosity_trajs,
        #    episode=lambda rs: np.max(rs))
        print_reward(curiosity_trajs, max_value=5000.0)
        print_reward(agent_trajs, max_value=90.0, episode=np.sum)

        curiosity_trajs = discount(curiosity_trajs, horizon=500)
        curiosity_trajs = normalize(curiosity_trajs)
        agent_trajs = discount(agent_trajs, horizon=500)
        agent_trajs = normalize(agent_trajs)
        agent_trajs = [traj[:-10] for traj in agent_trajs]
        agent_weight = 0.5  # + 0.4*(0.5 * (1 - np.cos(np.pi * episode / 20)))
        curiosity_weight = 1. - agent_weight
        comb_trajs = combine_rewards([curiosity_trajs, agent_trajs],
                                     [curiosity_weight, agent_weight])
        grad = policy_gradient(comb_trajs, policy=agent)
        agent_opt.apply_gradient(grad)

        oracle_trajs = [[(o1, (o2 - o1)[:2], 1.0)
                         for (o1, a1, r1), (o2, a2, r2) in zip(t, t[10:])]
                        for t in memory.trajectories(None, 4)]

        grad = policy_gradient(oracle_trajs, policy=oracle)
        oracle_opt.apply_gradient(grad)
Exemplo n.º 2
0
    def train_one(carrOpt):
        if carrOpt == None:
            carrOpt = Adam(
                np.random.randn(curCarr.n_params),
                lr=0.10,
                memory=0.5,
            )
        nextBreak = 5
        for i in range(250):
            curCarr.load_params(carrOpt.get_value())

            realTrajs, curiosityTrajs = world.trajectories(curCarr, 50)
            curScore = np.mean(get_rewards(realTrajs, episode=np.sum)) / 90.
            print_reward(realTrajs,
                         max_value=90.0,
                         episode=np.sum,
                         label="Real reward:      ")
            print_reward(curiosityTrajs,
                         max_value=1.0,
                         episode=np.max,
                         label="Curiosity reward: ")
            curCuriosity = np.mean(get_rewards(curiosityTrajs, episode=np.max))
            if curCuriosity > 0.98:
                if nextBreak == 0:
                    break
                else:
                    nextBreak -= 1
            else:
                nextBreak = np.min([nextBreak + 1, 5])

            realTrajs = replace_rewards(realTrajs, episode=np.sum)
            realTrajs = normalize(realTrajs)
            curiosityTrajs = replace_rewards(curiosityTrajs, episode=np.max)
            #this is stupid, we should care more(?) if the costs are to high
            realWeight = 0.001 + np.max([np.min([curScore, 0.2]), 0.
                                         ]) * 0.998 / 0.2
            curiosityWeight = 1. - realWeight
            print('RWeight: %f, CWeight: %f' % (realWeight, curiosityWeight))
            trajs = combine_rewards([realTrajs, curiosityTrajs],
                                    [realWeight, curiosityWeight])
            trajs = normalize(trajs)
            grad = policy_gradient(trajs, policy=curCarr)
            carrOpt.apply_gradient(grad)
            if i % 10 == 0:
                print("%d episodes in." % i)
        world.remember_agent(curCarr)
        world.render(curCarr)
        if curScore > 0.01:
            return carrOpt
        else:
            return None
Exemplo n.º 3
0
def run():
    model = Input(4)
    model = Affine(model, 128)
    model = LReLU(model)
    model = Affine(model, 2)
    model = Softmax(model)

    world = StochasticPolicy(Gym(make_env, max_steps=500))

    opt = Adam(np.random.randn(model.n_params) * 0.1, lr=0.01)

    for _ in range(50):
        model.load_params(opt.get_value())

        trajs = world.trajectories(model, 16)
        print_reward(trajs, max_value=5000)

        trajs = discount(trajs, horizon=500)
        trajs = normalize(trajs)

        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)

    while True:
        world.render(model)
Exemplo n.º 4
0
    def train_one(carrOpt):
        nonlocal oldTrajs
        classOpt = Adam(
            np.random.randn(classifier.n_params) * 1.,
            lr=0.5,
            memory=0.9,
        )
        if carrOpt == None:
            carrOpt = Adam(
                np.random.randn(curCarr.n_params),
                lr=0.10,
                memory=0.5,
            )
        curScore = 0.
        curAccuracy = 0.
        for i in range(250):
            classifier.load_params(classOpt.get_value())
            curCarr.load_params(carrOpt.get_value())

            oldTrajIdx = np.random.choice(len(oldTrajs), size=50)
            trajs = [oldTrajs[i] for i in oldTrajIdx]
            trajs += world.trajectories(curCarr, 50)
            trajsForClass = [tag_traj(traj, [1, 0]) for traj in trajs[:50]]
            trajsForClass += [tag_traj(traj, [0, 1]) for traj in trajs[50:]]
            plot_tagged_trajs(trajsForClass)
            accTrajs = accuracy(trajsForClass, model=classifier)
            print_reward(accTrajs,
                         max_value=1.0,
                         episode=np.mean,
                         label="Cla reward: ")
            curAccuracy = np.mean(get_rewards(accTrajs, episode=np.mean))
            if curAccuracy > 1. - i / 500:
                break

            grad = policy_gradient(trajsForClass, policy=classifier)
            classOpt.apply_gradient(grad)
            trajs2 = learn_from_classifier(classifier, trajs[50:], 1)
            print_reward(trajs2,
                         max_value=1.0,
                         episode=np.max,
                         label="Car reward: ")
            curScore = np.mean(get_rewards(trajs2, episode=np.max))
            trajs2 = replace_rewards(trajs2, episode=np.max)
            trajs2 = normalize(trajs2)
            grad2 = policy_gradient(trajs2, policy=curCarr)
            carrOpt.apply_gradient(grad2)
            if i % 10 == 0:
                print("%d episodes in." % i)
        oldTrajs += world.trajectories(curCarr, 800)
        world.render(curCarr)
        if curScore > 0.11:
            return carrOpt
        else:
            return None
Exemplo n.º 5
0
def curiosity(world):
    world = ActionNoise(world, stddev=0.1)
    memory = Cache(max_size=100)

    log_dir = "__oracle"
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    agent = build_agent()
    agent_opt = Adams(np.random.randn(agent.n_params), lr=0.00015, memory=0.5)

    oracle = build_oracle()
    oracle_opt = Adam(np.random.randn(oracle.n_params) * 0.1,
                      lr=0.05,
                      memory=0.95)

    for episode in range(1000):
        agent.load_params(agent_opt.get_value())
        oracle.load_params(oracle_opt.get_value())

        agent_trajs = world.trajectories(agent, 4)
        memory.add_trajectories(agent_trajs)

        predictions = retrace(agent_trajs, model=oracle)
        save_plot(log_dir + "/%04d.png" % (episode + 1), agent_trajs,
                  predictions)
        np.save(log_dir + "/%04d.npy" % (episode + 1), agent_opt.get_value())

        agent_trajs = [[
            (o1, a1, np.log(np.mean(np.square((o2 - o1) - delta_p))))
            for (o1, a1, r1), (o2, a2, r2), delta_p in zip(t, t[10:], p)
        ] for t, p in zip(agent_trajs, predictions)]
        agent_trajs = replace_rewards(agent_trajs,
                                      episode=lambda rs: np.max(rs) / len(rs))
        print_reward(agent_trajs, max_value=10.0)

        agent_trajs = normalize(agent_trajs)
        grad = policy_gradient(agent_trajs, policy=agent)
        agent_opt.apply_gradient(grad)

        oracle_trajs = [[(o1, o2 - o1, 1.0)
                         for (o1, a1, r1), (o2, a2, r2) in zip(t, t[10:])]
                        for t in memory.trajectories(None, 4)]

        grad = policy_gradient(oracle_trajs, policy=oracle)
        oracle_opt.apply_gradient(grad)
Exemplo n.º 6
0
def curiosity(world):
    world = ActionNoise(world, stddev=0.1)
    memory = Cache(delay=32 * 25)

    log_dir = "__car"
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    agent = build_agent()
    agent_opt = Adams(np.random.randn(agent.n_params), lr=0.00015, memory=0.5)

    classifier = build_classifier()
    classifier_opt = Adams(np.random.randn(classifier.n_params),
                           lr=0.00005,
                           memory=0.9)

    for episode in range(1000):
        agent.load_params(agent_opt.get_value())
        classifier.load_params(classifier_opt.get_value())

        agent_trajs = world.trajectories(agent, 32)
        memory.add_trajectory(*agent_trajs)

        classifier_trajs = (supervised(memory.trajectories(None, 32),
                                       label=lambda *_: [1, 0]) +
                            supervised(agent_trajs, label=lambda *_: [0, 1]))

        grad = policy_gradient(classifier_trajs, policy=classifier)
        classifier_opt.apply_gradient(grad)

        agent_trajs = replace_rewards(
            agent_trajs,
            model=classifier,
            reward=lambda o: o[1],
            episode=lambda rs: np.square(rs) / len(rs))

        save_plot(log_dir + "/%04d.png" % (episode + 1), classifier,
                  classifier_trajs)

        print_reward(agent_trajs, max_value=1.0)

        agent_trajs = discount(agent_trajs, horizon=100)
        agent_trajs = normalize(agent_trajs)
        grad = policy_gradient(agent_trajs, policy=agent)
        agent_opt.apply_gradient(grad)
Exemplo n.º 7
0
def train(world, model):
    opt = Adams(
        np.random.randn(model.n_params) * 0.1,
        lr=0.0001,
        memory=0.8
    )

    while True:
        model.load_params(opt.get_value())

        trajs = world.trajectories(model, 16)
        print_reward(trajs, max_value=500)

        if np.mean(get_rewards(trajs, episode=np.sum)) >= 498:
            return opt.get_value()

        trajs = discount(trajs, horizon=500)
        trajs = normalize(trajs)

        grad = policy_gradient(trajs, policy=model)
        opt.apply_gradient(grad)
Exemplo n.º 8
0
    def train_one():
        gaussOpt = Adam(
            [0., 0.],
            lr=0.010,
            memory=0.5,
        )
        classOpt = Adam(np.random.randn(classifier.n_params) * 0.1,
                        lr=0.5,
                        memory=0.99)
        gaussCenterer = Constant(2)
        gausses.append(gaussCenterer)
        curAccuracy = 0.
        while curAccuracy < 0.98:
            classifier.load_params(classOpt.get_value())
            gaussCenterer.load_params(gaussOpt.get_value())

            trajs = [[(gauss_observation(gausses[:-1]), [1, 0], 1.)]
                     for _ in range(500)]
            trajs += [[(gauss_observation(gausses[-1:]), [0, 1], 1.)]
                      for _ in range(500)]
            accTrajs = accuracy(trajs, model=classifier)
            print_reward(accTrajs, max_value=1.0)
            accs = [traj[0][2] for traj in accTrajs]
            curAccuracy = np.mean(accs)

            grad = policy_gradient(trajs, policy=classifier)
            classOpt.apply_gradient(grad)
            trajs2 = learn_from_classifier(classifier, trajs[500:], 1)
            trajs2 = normalize(trajs2)
            grad2 = policy_gradient(trajs2, policy=gaussCenterer)
            gaussOpt.apply_gradient(grad2)
            plt.clf()
            plt.grid()
            plt.gcf().axes[0].set_ylim([-1, 1])
            plt.gcf().axes[0].set_xlim([-1, 1])
            x, y = zip(*[o for ((o, _, _), ) in trajs[:500]])
            plt.scatter(x, y, color="blue")
            x, y = zip(*[o for ((o, _, _), ) in trajs[500:]])
            plt.scatter(x, y, color="red")
            plt.pause(0.01)
Exemplo n.º 9
0
def run():
    classifier = Input(7)
    classifier = Affine(classifier, 32)
    classifier = LReLU(classifier)
    classifier = Affine(classifier, 2)
    classifier = Softmax(classifier)

    agent = walker()
    agent.load_params(np.random.randn(agent.n_params) * 1.5)

    MAX_TRAIN_TIME = 200
    trainTimeLeft = MAX_TRAIN_TIME
    curAgentId = -1
    curMemoryId = 0

    def plot_tagged_trajs(trajs):
        nonlocal trainTimeLeft, curAgentId, curMemoryId
        COLORS = ["blue", "red"]
        plt.clf()
        plt.grid()
        plt.gcf().axes[0].set_xlim([-1.25, 1.25])
        plt.gcf().axes[0].set_ylim([-1.25, 1.25])
        plt.suptitle("Episode %d of agent %d, memories: %d" %
                     (MAX_TRAIN_TIME - trainTimeLeft, curAgentId, curMemoryId))
        for traj in trajs:
            tag = traj[0][1]
            xs, ys = [], []
            for state, _, _ in traj:
                x = state[2]
                y = state[3]
                xs.append(x)
                ys.append(y)
            plt.plot(xs, ys, color=COLORS[np.argmax(tag)], alpha=0.1)
        plt.gcf().set_size_inches(10, 8)
        plt.gcf().savefig("__step_a%03d_t%03d.png" %
                          (curAgentId, MAX_TRAIN_TIME - trainTimeLeft),
                          dpi=100)

    world = Gym("BipedalWalker-v2", max_steps=MAX_STEPS)
    world = ActionNoise(world, stddev=0.2)
    world = Curiosity(world,
                      classifier=classifier,
                      history_length=50,
                      for_classifier=lambda ts: change_obs_space(
                          ts, changer=interesting_part),
                      plot=plot_tagged_trajs)
    MAX_BOREDOM = 3
    boredom = MAX_BOREDOM

    MAX_MOTIVATION = 3
    motivation = MAX_MOTIVATION

    agentOpt = None
    lastScores = None

    def memorize():
        nonlocal boredom, curMemoryId
        print("Memorizing %d..." % curMemoryId)
        world.remember(agent)
        boredom = MAX_BOREDOM
        curMemoryId += 1

    def save_agent():
        np.save(
            "__ranger_a%03d_t%03d.npy" %
            (curAgentId, MAX_TRAIN_TIME - trainTimeLeft), agentOpt.get_value())

    def reset_agent():
        nonlocal agentOpt, trainTimeLeft, lastScores, curAgentId, motivation
        if agentOpt is not None:
            save_agent()
        print("Resetting agent %d." % curAgentId)
        agentOpt = Adam(
            np.random.randn(agent.n_params) * 1.5,
            lr=0.05,
            memory=0.9,
        )
        trainTimeLeft = MAX_TRAIN_TIME
        lastScores = [-0.4]
        curAgentId += 1
        motivation = MAX_MOTIVATION

    reset_agent()
    while True:
        agent.load_params(agentOpt.get_value())

        realTrajs, curiosityTrajs = world.trajectories(agent, 30)
        curScore = np.mean(get_rewards(realTrajs, episode=np.sum)) / 300.
        lastScores.append(curScore)
        lastScores = lastScores[-10:]
        scoreDev = np.std(lastScores)
        scoreMean = np.max([np.abs(np.mean(lastScores)), 1.])

        curCuriosity = np.mean(get_rewards(curiosityTrajs, episode=np.max))

        print_reward(realTrajs,
                     max_value=300.0,
                     episode=np.sum,
                     label="Real reward:      ")
        print_reward(curiosityTrajs,
                     max_value=1.0,
                     episode=np.max,
                     label="Curiosity reward: ")
        if curCuriosity > 0.85:
            if boredom == 0:
                save_agent()
                memorize()
            else:
                boredom -= 1
        else:
            boredom = np.min([boredom + 1, MAX_BOREDOM])

        if scoreDev / scoreMean < 0.010 or trainTimeLeft < 0:
            if motivation == 0:
                print("Not really learning.")
                save_agent()
                motivation = MAX_MOTIVATION
                trainTimeLeft = MAX_TRAIN_TIME
                if curScore < 0.01:
                    memorize()
                    reset_agent()
                    continue
            else:
                motivation -= 1
        else:
            motivation = np.min([motivation + 1, MAX_MOTIVATION])

        realTrajs = discount(realTrajs, horizon=200)
        realTrajs = normalize(realTrajs)
        curiosityTrajs = replace_rewards(curiosityTrajs, episode=np.max)
        realWeight = np.min([scoreDev / scoreMean * 10., 0.9])
        curiosityWeight = 1. - realWeight
        trajs = combine_rewards([realTrajs, curiosityTrajs],
                                [realWeight, curiosityWeight])
        trajs = normalize(trajs)
        grad = policy_gradient(trajs, policy=agent)
        agentOpt.apply_gradient(grad)

        trainTimeLeft -= 1