Example #1
def train(filename, seed, alpha2):

    here = path.dirname(__file__)
    map_fn = path.join(here, "craft/options.map")

    rng1 = Random(seed + 1)
    env1 = Craft(map_fn,
                 rng1,
                 7,
                 7,
                 objects=OBJECTS4,
                 problem_mood=problem_mood)
    init1 = create_init([env1.get_all_item()[2]], [[7, 7]])
    options = create_options(env1.get_all_item(), [7, 7])
    tasks = [[OBJECTS4["target"]]]
    not_task = [OBJECTS4["key"]]
    tasks = tasks[START_TASK:END_TASK + 1]

    with open(filename, "w") as csvfile:

        print("ql: begin experiment")
        for j, goal in enumerate(tasks):
            print("ql: begin task {}".format(j + START_TASK))

            try:
                start = time()
                report1 = SequenceReport(csvfile, LOG_STEP, init1,
                                         EPISODE_LENGTH, TRIALS)
                reward1 = ReachFacts(env1, goal, not_task, problem_mood)
                policy1 = Empathic(alpha=1.0,
                                   gamma=1.0,
                                   epsilon=0.2,
                                   default_q=DEFAULT_Q,
                                   num_actions=4,
                                   rng=rng1,
                                   others_q=[],
                                   others_init=[[7, 7], [7, 7], [7, 7], [7, 7],
                                                [7, 7]],
                                   others_dist=[0.2, 0.2, 0.2, 0.2, 0.2],
                                   penalty=-2 * EPISODE_LENGTH,
                                   others_alpha=[alpha2],
                                   objects=OBJECTS4,
                                   problem_mood=problem_mood,
                                   options=options)
                agent1 = Agent(env1, policy1, reward1, rng1)

                agent1.train(steps=TOTAL_STEPS2,
                             steps_per_episode=EPISODE_LENGTH,
                             report=report1)
                evaluate_agent(env1, policy1, reward1, init1)

            except KeyboardInterrupt:
                end = time()
                logging.warning("ql: interrupted task %s after %s seconds!",
                                j + START_TASK, end - start)
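
A minimal, hypothetical invocation of this example (the CSV filename, seed, and alpha2 value are illustrative; constants such as START_TASK, EPISODE_LENGTH, and TOTAL_STEPS2, along with the path/Random/time/logging imports, come from the enclosing module):

if __name__ == "__main__":
    # alpha2 becomes the single entry of the Empathic policy's
    # others_alpha list (illustrative value below).
    train("craft_options.csv", seed=0, alpha2=1.0)
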
Example #2
File: craftfunc.py  Project: praal/align
def train(filename, seed, emp_func):

    here = path.dirname(__file__)
    map_fn = path.join(here, "craft/doll.map")

    rng1 = Random(seed + 1)
    env1 = Craft(map_fn,
                 rng1,
                 1,
                 5,
                 objects=OBJECTS2,
                 problem_mood=problem_mood)

    # envs 2-4: plain Q-learning "other" agents; their learned Q-tables
    # are fed to the empathic agent's others_q below.
    rng2 = Random(seed + 2)
    env2 = Craft(map_fn,
                 rng2,
                 4,
                 1,
                 objects=OBJECTS2,
                 problem_mood=problem_mood)

    rng3 = Random(seed + 3)
    env3 = Craft(map_fn,
                 rng3,
                 5,
                 2,
                 objects=OBJECTS2,
                 problem_mood=problem_mood)

    rng4 = Random(seed + 4)
    env4 = Craft(map_fn,
                 rng4,
                 1,
                 3,
                 objects=OBJECTS2,
                 problem_mood=problem_mood)

    rng5 = Random(seed + 5)
    env5 = Craft(map_fn,
                 rng5,
                 1,
                 5,
                 objects=OBJECTS2,
                 problem_mood=problem_mood)

    # env6: the single other agent modeled by the baseline empathic agent5.
    rng6 = Random(seed + 6)
    env6 = Craft(map_fn,
                 rng6,
                 1,
                 5,
                 objects=OBJECTS2,
                 problem_mood=problem_mood)

    init1 = create_init([env1.get_all_item()[1]], [[1, 5]])
    init2 = create_init(env2.get_all_item(), [[4, 1]])
    init3 = create_init(env3.get_all_item(), [[5, 2]])
    init4 = create_init(env4.get_all_item(), [[1, 3]])
    init5 = create_init([env5.get_all_item()[1]], [[1, 5]])
    init6 = create_init([env6.get_all_item()[1]], [[1, 5]])

    tasks = [[OBJECTS2["played"]]]
    not_task = [OBJECTS2["doll"]]
    tasks = tasks[START_TASK:END_TASK + 1]

    with open(filename, "w") as csvfile:

        print("ql: begin experiment")
        report1 = SequenceReport(csvfile, LOG_STEP, init1, EPISODE_LENGTH,
                                 TRIALS)
        report2 = SequenceReport(csvfile, LOG_STEP, init2, EPISODE_LENGTH,
                                 TRIALS)
        report3 = SequenceReport(csvfile, LOG_STEP, init3, EPISODE_LENGTH,
                                 TRIALS)
        report4 = SequenceReport(csvfile, LOG_STEP, init4, EPISODE_LENGTH,
                                 TRIALS)
        report5 = SequenceReport(csvfile, LOG_STEP, init5, EPISODE_LENGTH,
                                 TRIALS)
        report6 = SequenceReport(csvfile, LOG_STEP, init6, EPISODE_LENGTH,
                                 TRIALS)

        for j, goal in enumerate(tasks):
            print("ql: begin task {}".format(j + START_TASK))
            rng2.seed(seed + j)

            reward2 = ReachFacts(env2, goal, [], problem_mood)
            policy2 = EpsilonGreedy(alpha=1.0,
                                    gamma=1.0,
                                    epsilon=0.2,
                                    default_q=DEFAULT_Q,
                                    num_actions=4,
                                    rng=rng2)
            agent2 = Agent(env2, policy2, reward2, rng2)

            rng3.seed(seed + j)
            reward3 = ReachFacts(env3, goal, [], problem_mood)
            policy3 = EpsilonGreedy(alpha=1.0,
                                    gamma=1.0,
                                    epsilon=0.2,
                                    default_q=DEFAULT_Q,
                                    num_actions=4,
                                    rng=rng3)
            agent3 = Agent(env3, policy3, reward3, rng3)

            rng4.seed(seed + j)
            reward4 = ReachFacts(env4, goal, [], problem_mood)
            policy4 = EpsilonGreedy(alpha=1.0,
                                    gamma=1.0,
                                    epsilon=0.2,
                                    default_q=DEFAULT_Q,
                                    num_actions=4,
                                    rng=rng4)
            agent4 = Agent(env4, policy4, reward4, rng4)

            rng6.seed(seed + j)
            reward6 = ReachFacts(env6, goal, [], problem_mood)
            policy6 = EpsilonGreedy(alpha=1.0,
                                    gamma=1.0,
                                    epsilon=0.2,
                                    default_q=DEFAULT_Q,
                                    num_actions=4,
                                    rng=rng6)
            agent6 = Agent(env6, policy6, reward6, rng6)

            try:
                start = time()
                agent2.train(steps=TOTAL_STEPS1,
                             steps_per_episode=EPISODE_LENGTH,
                             report=report2)
                agent3.train(steps=TOTAL_STEPS1,
                             steps_per_episode=EPISODE_LENGTH,
                             report=report3)

                agent4.train(steps=TOTAL_STEPS1,
                             steps_per_episode=EPISODE_LENGTH,
                             report=report4)

                agent6.train(steps=TOTAL_STEPS1,
                             steps_per_episode=EPISODE_LENGTH,
                             report=report6)

                report1 = SequenceReport(csvfile, LOG_STEP, init1,
                                         EPISODE_LENGTH, TRIALS)
                reward1 = ReachFacts(env1, goal, not_task, problem_mood)
                policy1 = Empathic(alpha=1.0,
                                   gamma=1.0,
                                   epsilon=0.2,
                                   default_q=DEFAULT_Q,
                                   num_actions=5,
                                   rng=rng1,
                                   others_q=[
                                       policy2.get_Q(),
                                       policy2.get_Q(),
                                       policy3.get_Q(),
                                       policy3.get_Q(),
                                       policy4.get_Q()
                                   ],
                                   others_init=[[4, 1], [4, 1], [5, 2], [5, 2],
                                                [1, 3]],
                                   others_dist=[0.2, 0.2, 0.2, 0.2, 0.2],
                                   penalty=-2 * EPISODE_LENGTH,
                                   others_alpha=[5.0, 5.0, 5.0, 5.0, 5.0],
                                   objects=OBJECTS2,
                                   problem_mood=problem_mood,
                                   caring_func=emp_func)
                agent1 = Agent(env1, policy1, reward1, rng1)

                agent1.train(steps=TOTAL_STEPS2,
                             steps_per_episode=EPISODE_LENGTH,
                             report=report1)

                reward5 = ReachFacts(env5, goal, not_task, problem_mood)
                policy5 = Empathic(alpha=1.0,
                                   gamma=1.0,
                                   epsilon=0.2,
                                   default_q=DEFAULT_Q,
                                   num_actions=5,
                                   rng=rng5,
                                   others_q=[policy6.get_Q()],
                                   others_init=[[1, 5]],
                                   others_dist=[1.0],
                                   penalty=-2 * EPISODE_LENGTH,
                                   others_alpha=[5.0],
                                   objects=OBJECTS2,
                                   problem_mood=problem_mood)
                agent5 = Agent(env5, policy5, reward5, rng5)

                if emp_func == "baseline":
                    agent5.train(steps=TOTAL_STEPS2,
                                 steps_per_episode=EPISODE_LENGTH,
                                 report=report5)
                    evaluate_agent(env5, policy5, reward5, init5)

                elif emp_func == "nonaug":
                    evaluate_agent(env6, policy6, reward6, init6)
                else:
                    evaluate_agent(env1, policy1, reward1, init1)

            except KeyboardInterrupt:
                end = time()
                logging.warning("ql: interrupted task %s after %s seconds!",
                                j + START_TASK, end - start)
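
emp_func selects the evaluation branch in the code above; any value other than "baseline" or "nonaug" is also forwarded to Empathic as caring_func. A hypothetical call (filename and seed are illustrative; "sum" is the caring function used in craftdiff.py):

if __name__ == "__main__":
    # "baseline" -> train and evaluate the baseline empathic agent5
    # "nonaug"   -> evaluate the plain Q-learning agent6
    # otherwise  -> evaluate the empathic agent1 (e.g. emp_func="sum")
    train("craft_doll.csv", seed=0, emp_func="baseline")
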
Example #3
File: kitchen_salt.py  Project: praal/align
def train(filename, seed):
    problem_mood = 4
    here = path.dirname(__file__)
    map_fn = path.join(here, "craft/kitchen.map")

    rng1 = Random(seed + 1)
    env1 = Kitchen(map_fn, rng1, 1, 1, problem_mood=problem_mood)
    init1 = create_init([[1, 1]])

    rng4 = Random(seed + 4)
    env4 = Kitchen(map_fn, rng4, 1, 1, problem_mood=problem_mood)
    init4 = create_init([[1, 1]])

    rng2 = Random(seed + 2)
    env2 = Kitchen(map_fn, rng2, 1, 1, problem_mood=problem_mood, short=False)
    init2 = create_init([[1, 1]])

    rng3 = Random(seed + 3)
    env3 = Kitchen(map_fn, rng3, 1, 1, problem_mood=problem_mood, short=True)
    init3 = create_init([[1, 1]])

    tasks = [[OBJECTS4["food"]]]
    not_task = []
    tasks = tasks[START_TASK:END_TASK + 1]

    with open(filename, "w") as csvfile:

        print("ql: begin experiment")
        report2 = SequenceReport(csvfile, LOG_STEP, init2, EPISODE_LENGTH,
                                 TRIALS)
        report3 = SequenceReport(csvfile, LOG_STEP, init3, EPISODE_LENGTH,
                                 TRIALS)

        for j, goal in enumerate(tasks):
            print("ql: begin task {}".format(j + START_TASK))
            rng2.seed(seed + j)

            # agent2: plain Q-learning agent; it serves both as the "other"
            # agent and as the single-agent baseline in the printout below.
            reward2 = ReachFacts(env2, goal, [])
            policy2 = EpsilonGreedy(alpha=1.0,
                                    gamma=1.0,
                                    epsilon=0.2,
                                    default_q=DEFAULT_Q,
                                    num_actions=4,
                                    rng=rng2)
            agent2 = Agent(env2, policy2, reward2, rng2)

            rng3.seed(seed + j)

            reward3 = ReachFacts(env3, goal, [])
            policy3 = BaseEmpathic(alpha=1.0,
                                   gamma=1.0,
                                   epsilon=0.2,
                                   default_q=DEFAULT_Q,
                                   num_actions=5,
                                   rng=rng3,
                                   others_q=[policy2.get_Q()],
                                   others_init=[[1, 1]],
                                   others_dist=[1.0],
                                   penalty=-2 * EPISODE_LENGTH,
                                   others_alpha=[0.0],
                                   objects=OBJECTS4,
                                   problem_mood=problem_mood)

            agent3 = Agent(env3, policy3, reward3, rng3)

            try:
                start = time()
                agent2.train(steps=TOTAL_STEPS1,
                             steps_per_episode=EPISODE_LENGTH,
                             report=report2)
                agent3.train(steps=TOTAL_STEPS1,
                             steps_per_episode=EPISODE_LENGTH,
                             report=report3)

                report1 = SequenceReport(csvfile, LOG_STEP, init1,
                                         EPISODE_LENGTH, TRIALS)
                reward1 = ReachFacts(env1, goal, not_task)

                policy1 = BaseEmpathic(alpha=1.0,
                                       gamma=1.0,
                                       epsilon=0.2,
                                       default_q=DEFAULT_Q,
                                       num_actions=6,
                                       rng=rng1,
                                       others_q=[policy2.get_Q()],
                                       others_init=[[1, 1]],
                                       others_dist=[1.0],
                                       penalty=-2 * EPISODE_LENGTH,
                                       others_alpha=[0.1],
                                       objects=OBJECTS4,
                                       problem_mood=problem_mood)
                agent1 = Agent(env1, policy1, reward1, rng1)

                agent1.train(steps=TOTAL_STEPS2,
                             steps_per_episode=EPISODE_LENGTH,
                             report=report1)
                report4 = SequenceReport(csvfile, LOG_STEP, init4,
                                         EPISODE_LENGTH, TRIALS)
                reward4 = ReachFacts(env4, goal, not_task)
                policy4 = BaseEmpathic(alpha=1.0,
                                       gamma=1.0,
                                       epsilon=0.2,
                                       default_q=DEFAULT_Q,
                                       num_actions=6,
                                       rng=rng4,
                                       others_q=[policy3.get_Q()],
                                       others_init=[[1, 1]],
                                       others_dist=[1.0],
                                       penalty=-2 * EPISODE_LENGTH,
                                       others_alpha=[1.0],
                                       objects=OBJECTS4,
                                       problem_mood=problem_mood)
                agent4 = Agent(env4, policy4, reward4, rng4)

                agent4.train(steps=TOTAL_STEPS2,
                             steps_per_episode=EPISODE_LENGTH,
                             report=report4)

                print("-----------------")
                print("The single agent in the env without considering others")
                base_step = evaluate_agent(env2, policy2, reward2, init2)
                print("Total Reward = ", base_step)
                print("-----------------")
                print("Baseline 1, Q-learning")
                step2 = evaluate_second_agent(env2, policy2, reward2, env3,
                                              policy3, reward3)
                print("Total Reward Compared to Base = ",
                      -1 * (step2[0] - base_step), -1 * (step2[1] - base_step))
                print("-----------------")
                print("Baseline 2, Krakovna et al")
                step1 = evaluate_second_agent(env1, policy1, reward1, env3,
                                              policy3, reward3)
                print("Total Reward Compared to Base= ",
                      -1 * (step1[0] - base_step), -1 * (step1[1] - base_step))
                print("-----------------")
                print("Our Approach")
                step3 = evaluate_second_agent(env4, policy4, reward4, env3,
                                              policy3, reward3)
                print("Total Reward Compared to Base= ",
                      -1 * (step3[0] - base_step), -1 * (step3[1] - base_step))
                print("-----------------")

            except KeyboardInterrupt:
                end = time()
                logging.warning("ql: interrupted task %s after %s seconds!",
                                j + START_TASK, end - start)
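
A hypothetical driver for this example (filename and seed are illustrative). Running it trains all four agents and prints the comparison shown in the code:

if __name__ == "__main__":
    # Prints the single-agent baseline first, then Q-learning,
    # Krakovna et al., and the empathic approach, each reported
    # relative to the base agent's total reward.
    train("kitchen_salt.csv", seed=0)
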
Example #4
def train(filename, seed, foldername, alpha_start, alpha_end):

    here = path.dirname(__file__)
    map_fn = path.join(here, "craft/map_seq.map")

    rng1 = Random(seed)
    env1 = Craft(map_fn,
                 rng1,
                 1,
                 1,
                 objects=OBJECTS1,
                 problem_mood=problem_mood,
                 tool_in_fact=True,
                 wood_in_fact=True)
    init = create_init(env1.get_all_item(), [[1, 1]], True)

    tasks = [[OBJECTS1["box"]]]
    not_task = []
    tasks = tasks[START_TASK:END_TASK + 1]

    with open(filename, "w") as csvfile:

        print("ql: begin experiment")
        report = SequenceReport(csvfile, LOG_STEP, init, EPISODE_LENGTH,
                                TRIALS)

        for j, goal in enumerate(tasks):
            print("ql: begin task {}".format(j + START_TASK))
            rng1.seed(seed + j)

            reward1 = ReachFacts(env1, goal, not_task, problem_mood)
            policy1 = EpsilonGreedy(alpha=1.0,
                                    gamma=1.0,
                                    epsilon=0.5,
                                    default_q=DEFAULT_Q,
                                    num_actions=4,
                                    rng=rng1)
            agent = Agent(env1, policy1, reward1, rng1)

            try:
                start = time()
                agent.train(steps=TOTAL_STEPS1,
                            steps_per_episode=EPISODE_LENGTH,
                            report=report)

                # Sweep others_alpha from alpha_start/100 to
                # (alpha_end - 1)/100 in steps of 0.01.
                for alpha in range(alpha_start, alpha_end):

                    print("alpha:", alpha / 100.0)
                    rng2 = Random(seed + 1)
                    env2 = Craft(map_fn,
                                 rng2,
                                 1,
                                 1,
                                 objects=OBJECTS1,
                                 problem_mood=problem_mood)
                    init2 = create_init(env2.get_all_item(), [[1, 1]])
                    report2 = SequenceReport(csvfile, LOG_STEP, init2,
                                             EPISODE_LENGTH, TRIALS)
                    reward2 = ReachFacts(env2, goal, not_task, problem_mood)
                    policy2 = Empathic(alpha=1.0,
                                       gamma=1.0,
                                       epsilon=0.5,
                                       default_q=DEFAULT_Q,
                                       num_actions=5,
                                       rng=rng2,
                                       others_q=[policy1.get_Q()],
                                       penalty=-2 * EPISODE_LENGTH,
                                       others_alpha=[alpha / 100.0],
                                       objects=OBJECTS1,
                                       problem_mood=problem_mood)
                    agent2 = Agent(env2, policy2, reward2, rng2)

                    agent2.train(steps=TOTAL_STEPS2,
                                 steps_per_episode=EPISODE_LENGTH,
                                 report=report2)
                    test(env2, policy2, reward2, env1, policy1, reward1, init,
                         alpha, foldername)

            except KeyboardInterrupt:
                end = time()
                logging.warning("ql: interrupted task %s after %s seconds!",
                                j + START_TASK, end - start)
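
A hypothetical call (all values illustrative). Since the inner loop divides alpha by 100, this sweeps others_alpha over 0.00 through 0.99 in steps of 0.01:

if __name__ == "__main__":
    train("craft_seq.csv", seed=0, foldername="results",
          alpha_start=0, alpha_end=100)
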
Example #5
File: craftdiff.py  Project: praal/align
def train(filename, seed, alpha2):

    here = path.dirname(__file__)
    map_fn = path.join(here, "craft/garden.map")

    rng1 = Random(seed + 1)
    env1 = Craft(map_fn,
                 rng1,
                 1,
                 5,
                 objects=OBJECTS3,
                 problem_mood=problem_mood)

    rng2 = Random(seed + 2)
    env2 = Craft(map_fn,
                 rng2,
                 1,
                 5,
                 objects=OBJECTS3,
                 problem_mood=problem_mood,
                 fence=True)

    init1 = create_init(env1.get_all_item(), [[1, 5]])
    init2 = create_init(env1.get_all_item(), [[1, 5]], True)

    tasks = [[OBJECTS3["target"]]]
    not_task = []
    tasks = tasks[START_TASK:END_TASK + 1]

    with open(filename, "w") as csvfile:

        print("ql: begin experiment")
        report1 = SequenceReport(csvfile, LOG_STEP, init1, EPISODE_LENGTH,
                                 TRIALS)
        report2 = SequenceReport(csvfile, LOG_STEP, init2, EPISODE_LENGTH,
                                 TRIALS)

        for j, goal in enumerate(tasks):
            print("ql: begin task {}".format(j + START_TASK))
            rng2.seed(seed + j)

            # The other agent is trained on the fenced variant of the garden.
            reward2 = ReachFacts(env2, goal, [], problem_mood)
            policy2 = EpsilonGreedy(alpha=1.0,
                                    gamma=1.0,
                                    epsilon=0.2,
                                    default_q=DEFAULT_Q,
                                    num_actions=4,
                                    rng=rng2)
            agent2 = Agent(env2, policy2, reward2, rng2)

            try:
                start = time()
                agent2.train(steps=TOTAL_STEPS1,
                             steps_per_episode=EPISODE_LENGTH,
                             report=report2)

                reward1 = ReachFacts(env1, goal, not_task, problem_mood)
                policy1 = Empathic(alpha=1.0,
                                   gamma=1.0,
                                   epsilon=0.2,
                                   default_q=DEFAULT_Q,
                                   num_actions=5,
                                   rng=rng1,
                                   others_q=[policy2.get_Q(),
                                             policy2.get_Q()],
                                   others_init=[[1, 5], [1, 5]],
                                   others_dist=[0.5, 0.5],
                                   penalty=-2 * EPISODE_LENGTH,
                                   others_alpha=[alpha2, 1.0],
                                   objects=OBJECTS3,
                                   problem_mood=problem_mood,
                                   caring_func="sum",
                                   restricted=[[2, 1], []])
                agent1 = Agent(env1, policy1, reward1, rng1)

                agent1.train(steps=TOTAL_STEPS2,
                             steps_per_episode=EPISODE_LENGTH,
                             report=report1)
                evaluate_agent(env1, policy1, reward1, init1)

            except KeyboardInterrupt:
                end = time()
                logging.warning("ql: interrupted task %s after %s seconds!",
                                j + START_TASK, end - start)
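
A hypothetical call (values illustrative). alpha2 becomes the first entry of others_alpha; the second simulated agent's weight is fixed at 1.0:

if __name__ == "__main__":
    train("craft_garden.csv", seed=0, alpha2=0.5)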