Example #1
class Team(object):
    def __init__(self, living, task, learner = ENAC()):
        self.living = living
        self.task = task
        self.last_reward = 0
        self.agent = LearningAgent(self.living.brain, learner)
        self.oldparams = self.living.brain.params
    def Interaction(self):
        self.agent.integrateObservation(self.task.getObservation())
        self.task.performAction(self.agent.getAction())
        self.last_reward = self.task.getReward()
        self.agent.giveReward(self.last_reward)
        
        finished = self.task.isFinished()
        if finished:
            #print task.cumreward
            self.agent.newEpisode()
            self.task.reset()
        return self.last_reward, finished
    
    def Learn(self, episodes = 1):    
        self.agent.learn(episodes)
        self.agent.reset()
                        
        newparams = self.living.brain.params.copy() #get_all_weights(eater.brain)[:]
        dif = 0
        j = 0
        for i in newparams:
            dif += (self.oldparams[j] - newparams[j])**2
            j += 1
        self.oldparams = newparams
        return dif
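The element-wise loop in Learn() computes a squared Euclidean distance between the old and new parameter vectors; the same quantity can be computed in one NumPy call. A minimal sketch, assuming both arguments are 1-D arrays (the helper name is illustrative, not part of the original code):

import numpy

def param_distance(oldparams, newparams):
    # squared Euclidean distance between two parameter vectors
    diff = numpy.asarray(newparams, dtype=float) - numpy.asarray(oldparams, dtype=float)
    return float(numpy.dot(diff, diff))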
Example #2
def q_learning_table():
    controller = ActionValueTable(36, 4)
    learner = Q()
    controller.initialize(1.)

    agent = LearningAgent(controller, learner)

    score_list = []
    turn_list = []
    # +100 extra iterations to cover the neural-network version's training
    for i in range(600):
        print_state(agent.module.getValue, 'table')

        score, turn = play(agent, 'table')
        score_list.append(score)
        turn_list.append(turn)

        agent.learn()
        agent.reset()

        print i, int(numpy.mean(score_list)), max(score_list), score, turn

        with open('./agent.dump', 'w') as f:
            pickle.dump(agent, f)
        with open('./score.dump', 'w') as f:
            pickle.dump([score_list, turn_list], f)
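The dumps above open the files in text mode ('w'), which works with pickle's default protocol under Python 2, the interpreter these examples target. Under Python 3, pickle requires a binary file handle; a minimal sketch of the same checkpointing step:

import pickle

with open('./agent.dump', 'wb') as f:  # binary mode is required by pickle on Python 3
    pickle.dump(agent, f)
with open('./score.dump', 'wb') as f:
    pickle.dump([score_list, turn_list], f)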
Example #3
def test_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 0, 0, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1]])
    shape = np.array(structure.shape)
    environment = Maze(structure,  tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order 
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4),1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(''.join(row) for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
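The greedy policy above is read off the whole table at once; for a single state the same read-out can be done with getActionValues, which this collection also uses in the blackjack examples. A small sketch, keeping the NESW action order assumed above (the chosen state index is illustrative):

state = 6                             # index of a free cell in the flattened 5x5 maze
values = controller.getActionValues(state)
best_action = int(np.argmax(values))  # 0..3, interpreted as N, E, S, W in this example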
Example #4
def train():

    # Make the environment
    environment = TwentyFortyEightEnvironment()

    # The task is the game this time
    task = environment

    # Make the reinforcement learning agent (use a network because inputs are continuous)
    network = ActionValueNetwork(task.nSenses, task.nActions)

    # Use NFQ, the neural-fitted variant of Q-learning, since the controller is a network (plain Q works on tables)
    learner = NFQ()
    learner.gamma = GAMMA

    agent = LearningAgent(network, learner)

    # Set up an experiment
    experiment = EpisodicExperiment(task, agent)

    # Train the Learner
    meanScores = []
    for i in xrange(LEARNING_EPOCHS):
        experiment.doEpisodes(GAMES_PER_EPOCH)
        print "Iteration ", i, " With mean score ", task.meanScore, "Max block achieved ", environment.maxGameBlock
        meanScores.append(task.meanScore)
        agent.learn()
        agent.reset()

    params = {"learningEpochs": LEARNING_EPOCHS, "gamesPerEpoch": GAMES_PER_EPOCH, "gamma": GAMMA }
    return meanScores, params, agent
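train() refers to module-level constants (GAMMA, LEARNING_EPOCHS, GAMES_PER_EPOCH) that are not shown in this excerpt. A minimal sketch of how it might be driven, with placeholder values for those constants:

GAMMA = 0.9
LEARNING_EPOCHS = 50
GAMES_PER_EPOCH = 20

meanScores, params, agent = train()
print "final mean score:", meanScores[-1]  # Python 2 print, as in the example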
Example #5
def run_bbox(verbose=False):
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)

    # query the level properties after it is loaded or reset, so the
    # ActionValueTable below is always built with valid dimensions
    n_features = bbox.get_num_of_features()
    n_actions = bbox.get_num_of_actions()
    max_time = bbox.get_max_time()

    av_table = ActionValueTable(n_features, n_actions)
    av_table.initialize(0.2)
    print av_table._params
    learner = Q(0.5, 0.1)
    learner._setExplorer(EpsilonGreedyExplorer(0.4))
    agent = LearningAgent(av_table, learner)
    environment = GameEnvironment()
    task = GameTask(environment)
    experiment = Experiment(task, agent)

    while environment.finish_flag:
        experiment.doInteractions(1)
        agent.learn()
 
    bbox.finish(verbose=1)
Example #6
def q_learning_table():
    controller = ActionValueTable(36, 4)
    learner = Q()
    controller.initialize(1.)

    agent = LearningAgent(controller, learner)

    score_list = []
    turn_list  = []
    # +100 extra iterations to cover the neural-network version's training
    for i in range(600):
        print_state(agent.module.getValue, 'table')

        score, turn = play(agent, 'table')
        score_list.append(score)
        turn_list.append(turn)

        agent.learn()
        agent.reset()

        print i, int(numpy.mean(score_list)) , max(score_list), score, turn

        with open('./agent.dump', 'w') as f:
            pickle.dump(agent, f)
        with open('./score.dump', 'w') as f:
            pickle.dump([score_list, turn_list], f)
Example #7
def learn(client):
	av_table = ActionValueNetwork(4, 1)

	learner = Reinforce()
	agent = LearningAgent(av_table, learner)

	env = CarEnvironment(client)
	task = CarTask(env)

	experiment = ContinuousExperiment(task, agent)

	while True:
		experiment.doInteractionsAndLearn(1)
		agent.learn()
Example #8
class QAlgorithm:
  def Pause(self):  # if the menu says pause, pause execution
    while self.state == 1:
      time.sleep(.05)
    return True

  def Quit(self):#if menu says quit stop running
    self.process.terminate()
    return False

  def Start(self):#starts the Bot
    if self.process == None:
      self.runBot()
      #self.process = multiprocessing.Process(target=self.runBot, args= [])
      #self.process.start() 
    return True

  def CheckState(self):#checks to see what state the menu says to be in 
    if self.state == 0 :
      self.Start()
    elif self.state == 1:
      self.Pause()
    elif self.state == 2:
      self.Quit()

  def GameOver(self):  # checks whether the menu state requires the bot to pause or quit, or whether the game is over
    return self.CheckState() or self.sr.checkEndGame(self.endBox,self.gameOver)

  def __init__(self,rewardBox,box,gameOver,endGame,scoreArea):
    self.reward = rewardBox
    self.bbox = box
    self.environment = TEnviroment(box)#Custom environment class
    if os.path.isfile("bot.txt"):
      self.controller  = pickle.load(open("bot.txt","rb")) 
    else:
      self.controller = ActionValueNetwork(50**2, 4)  # arguments: (framerate * maxPlaytime, number of actions)
    self.learner = Q()
    gf = {0:self.GameOver}
    self.agent = LearningAgent(self.controller, self.learner)
    self.task = TTask(self.environment,scoreArea,gf)#needs custom task
    self.experiment = EpisodicExperiment(self.task, self.agent)
    self.process = None
    self.endBox = endGame

  def runBot(self):  # runs the bot for a single episode
      self.experiment.doEpisodes()
      self.agent.learn()
      self.agent.reset()
      file = open("bot.txt","wb+")
      pickle.dump(self.controller,file)
Example #9
 def learn(self, number_of_iterations):
     learner = Q(0.2, 0.8)
     task = CartMovingTask(self.environment)
     self.controller = ActionValueTable(
         reduce(lambda x, y: x * y, map(lambda x: len(x), self.ranges)),
         self.force_granularity)
     self.controller.initialize(1.)
     agent = LearningAgent(self.controller, learner)
     experiment = Experiment(task, agent)
     for i in range(number_of_iterations):
         experiment.doInteractions(1)
         agent.learn()
         agent.reset()
     with open("test.pcl", "w+") as f:
         pickle.dump(self.controller, f)
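The reduce/map expression above is just the product of the range lengths, i.e. the size of the discretized state space. An equivalent, more direct form, assuming self.ranges is a list of sequences:

import numpy

num_states = int(numpy.prod([len(r) for r in self.ranges]))
self.controller = ActionValueTable(num_states, self.force_granularity)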
Example #10
 def learn(self, number_of_iterations):
     learner = Q(0.2, 0.8)
     task = CartMovingTask(self.environment)
     self.controller = ActionValueTable(
         reduce(lambda x, y: x * y, map(lambda x: len(x), self.ranges)), self.force_granularity
     )
     self.controller.initialize(1.0)
     agent = LearningAgent(self.controller, learner)
     experiment = Experiment(task, agent)
     for i in range(number_of_iterations):
         experiment.doInteractions(1)
         agent.learn()
         agent.reset()
     with open("test.pcl", "w+") as f:
         pickle.dump(self.controller, f)
Example #11
    def maze():
        # import sys, time
        pylab.gray()
        pylab.ion()
        # The goal appears to be in the upper right
        structure = [
            "!!!!!!!!!!",
            "! !  ! ! !",
            "! !! ! ! !",
            "!    !   !",
            "! !!!!!! !",
            "! ! !    !",
            "! ! !!!! !",
            "!        !",
            "! !!!!!  !",
            "!   !    !",
            "!!!!!!!!!!",
        ]
        structure = np.array([[ord(c) - ord(" ") for c in row] for row in structure])
        shape = np.array(structure.shape)
        environment = Maze(structure, tuple(shape - 2))
        controller = ActionValueTable(shape.prod(), 4)
        controller.initialize(1.0)
        learner = Q()
        agent = LearningAgent(controller, learner)
        task = MDPMazeTask(environment)
        experiment = Experiment(task, agent)

        for i in range(100):
            experiment.doInteractions(100)
            agent.learn()
            agent.reset()
            # 4 actions, 81 locations/states (9x9 grid)
            # max(1) gives/plots the biggest objective function value for that square
            pylab.pcolor(controller.params.reshape(81, 4).max(1).reshape(9, 9))
            pylab.draw()

        # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
        greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
        greedy_policy = np.flipud(np.array(list("NESW"))[greedy_policy].reshape(shape))
        maze = np.flipud(np.array(list(" #"))[structure])
        print("Maze map:")
        print("\n".join("".join(row) for row in maze))
        print("Greedy policy:")
        print("\n".join("".join(row) for row in greedy_policy))
Example #12
    def maze():
        # import sys, time
        pylab.gray()
        pylab.ion()
        # The goal appears to be in the upper right
        structure = [
            '!!!!!!!!!!',
            '! !  ! ! !',
            '! !! ! ! !',
            '!    !   !',
            '! !!!!!! !',
            '! ! !    !',
            '! ! !!!! !',
            '!        !',
            '! !!!!!  !',
            '!   !    !',
            '!!!!!!!!!!',
            ]
        structure = np.array([[ord(c)-ord(' ') for c in row] for row in structure])
        shape = np.array(structure.shape)
        environment = Maze(structure, tuple(shape - 2))
        controller = ActionValueTable(shape.prod(), 4)
        controller.initialize(1.)
        learner = Q()
        agent = LearningAgent(controller, learner)
        task = MDPMazeTask(environment)
        experiment = Experiment(task, agent)

        for i in range(100):
            experiment.doInteractions(100)
            agent.learn()
            agent.reset()
            # 4 actions, 81 locations/states (9x9 grid)
            # max(1) gives/plots the biggest objective function value for that square
            pylab.pcolor(controller.params.reshape(81, 4).max(1).reshape(9, 9))
            pylab.draw()

        # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
        greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
        greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
        maze = np.flipud(np.array(list(' #'))[structure])
        print('Maze map:')
        print('\n'.join(''.join(row) for row in maze))
        print('Greedy policy:')
        print('\n'.join(''.join(row) for row in greedy_policy))
Example #13
def main():

    # Storing every possible 2048 board state in a table is infeasible:
    #   there are on the order of 14^16 states.
    #controller = ActionValueTable(16, 4)
    #learner = Q()
    #controller.initialize(1.)

    controller = ActionValueNetwork(16, 4)
    learner = NFQ()
    #learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(controller, learner)

    score_list = []
    for i in range(10000):
        # if os.path.exists('./agent.dump'):
        #     with open('./agent.dump') as f:
        #         agent = pickle.load(f)

        print i, 'playing ...'
        score = play(agent)
        score_list.append(score)

        # At this point the following error occurred:
        #   TypeError: only length-1 arrays can be converted to Python scalars
        #   in pybrain/rl/learners/valuebased/q.py
        #   => switching the learner from Q to NFQ fixed it.
        #   => http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work
        print i, 'learning ...'
        agent.learn()
        agent.reset()

        print i, 'evaluate sample ...'
        data = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 2]]
        agent.integrateObservation(numpy.array(data).ravel())
        move = agent.getAction()
        print "                           ", i, int(
            numpy.mean(score_list)), max(score_list), move

        if i % 20 == 0:
            print i, 'saving ...'
            with open('./agent.dump', 'w') as f:
                pickle.dump(agent, f)
            with open('./score.dump', 'w') as f:
                pickle.dump(score_list, f)
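The commented-out block at the top of the loop hints at resuming from an earlier checkpoint; a minimal sketch of that load step, matching the dump format used above (open with 'rb' under Python 3):

import os
import pickle

if os.path.exists('./agent.dump'):
    with open('./agent.dump') as f:
        agent = pickle.load(f)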
Example #14
def main():
    rospy.init_node("lauron_reinforcement_learning")
    environment = RLEnvironment()
    dim_state = environment.joint_states.shape[0]
    num_actions = len(environment.actions)
    controller = ActionValueNetwork(dim_state, num_actions)
    learner = SARSA()
    agent = LearningAgent(controller, learner)
    task = RLTask(environment)
    experiment = Experiment(task, agent)

    episode_counter = 0
    while True:
        print("Training episode {}".format(episode_counter))
        experiment.doInteractions(NUM_INTERACTIONS)
        agent.learn()
        agent.reset()
        episode_counter += 1
Example #15
def main():

    # Storing every possible 2048 board state in a table is infeasible:
    #   there are on the order of 14^16 states.
    #controller = ActionValueTable(16, 4)
    #learner = Q()
    #controller.initialize(1.)

    controller = ActionValueNetwork(16, 4)
    learner = NFQ()
    #learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(controller, learner)

    score_list = []
    for i in range(10000):
        # if os.path.exists('./agent.dump'):
        #     with open('./agent.dump') as f:
        #         agent = pickle.load(f)

        print i, 'playing ...'
        score = play(agent)
        score_list.append(score)

        # At this point the following error occurred:
        #   TypeError: only length-1 arrays can be converted to Python scalars
        #   in pybrain/rl/learners/valuebased/q.py
        #   => switching the learner from Q to NFQ fixed it.
        #   => http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work
        print i, 'learning ...'
        agent.learn()
        agent.reset()

        print i, 'evaluate sample ...'
        data =[[0,0,0,0], [0,0,0,0], [0,0,0,2], [0,0,0,2]]
        agent.integrateObservation(numpy.array(data).ravel())
        move = agent.getAction()
        print "                           ",i, int(numpy.mean(score_list)) , max(score_list), move

        if i % 20 == 0:
            print i, 'saving ...'
            with open('./agent.dump', 'w') as f:
                pickle.dump(agent, f)
            with open('./score.dump', 'w') as f:
                pickle.dump(score_list, f)
Example #16
def test_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = [
        list('!!!!!!!!!!'),
        list('! !  ! ! !'),
        list('! !! ! ! !'),
        list('!    !   !'),
        list('! !!!!!! !'),
        list('! ! !    !'),
        list('! ! !!!! !'),
        list('!        !'),
        list('! !!!!!  !'),
        list('!   !    !'),
        list('!!!!!!!!!!'),
    ]
    structure = np.array([[ord(c) - ord(' ') for c in row]
                          for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(
        np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(
        ''.join(row)
        for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
Example #17
def run():
    """
    number of states is:
    current value: 0-20

    number of actions:
    Stand=0, Hit=1 """

    # define action value table
    av_table = ActionValueTable(MAX_VAL, MIN_VAL)
    av_table.initialize(0.)

    # define Q-learning agent
    q_learner = Q(Q_ALPHA, Q_GAMMA)
    q_learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(av_table, q_learner)

    # define the environment
    env = BlackjackEnv()

    # define the task
    task = BlackjackTask(env, verbosity=VERBOSE)

    # finally, define experiment
    experiment = Experiment(task, agent)

    # ready to go, start the process
    for _ in range(NB_ITERATION):
        experiment.doInteractions(1)
        if task.lastreward != 0:
            if VERBOSE:
                print "Agent learn"
            agent.learn()

    print '|First State|Choice 0 (Stand)|Choice 1 (Hit)|Relative value of Standing over Hitting|'
    print '|:-------:|:-------|:-----|:-----|'
    for i in range(MAX_VAL):
        print '| %s | %s | %s | %s |' % (
            (i+1),
            av_table.getActionValues(i)[0],
            av_table.getActionValues(i)[1],
            av_table.getActionValues(i)[0] - av_table.getActionValues(i)[1]
        )
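The last column of the printed table is Q(s, Stand) - Q(s, Hit), so the greedy policy per hand value falls out of the same table. A small sketch reusing av_table and MAX_VAL from the example:

for i in range(MAX_VAL):
    values = av_table.getActionValues(i)
    choice = 'Hit' if values[1] > values[0] else 'Stand'
    print '| %s | %s |' % (i + 1, choice)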
Example #18
class RL:
    def __init__(self):
        self.av_table = ActionValueTable(4, 5)
        self.av_table.initialize(0.1)

        learner = SARSA()
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)

        env = HASSHEnv()

        task = HASSHTask(env)

        self.experiment = Experiment(task, self.agent)

    def go(self):
        global rl_params
        rassh.core.constants.rl_params = self.av_table.params.reshape(4, 5)[0]
        self.experiment.doInteractions(1)
        self.agent.learn()
Example #19
class RL:
    def __init__(self):
        self.av_table = ActionValueTable(2, 3)
        self.av_table.initialize(0.1)

        learner = SARSA()
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)

        env = HASSHEnv()

        task = HASSHTask(env)

        self.experiment = Experiment(task, self.agent)

    def go(self):
        global rl_params
        kippo.core.constants.rl_params = self.av_table.params.reshape(2, 3)[0]
        self.experiment.doInteractions(1)
        self.agent.learn()
Example #20
def run():
    """
    number of states is:
    current value: 0-20

    number of actions:
    Stand=0, Hit=1 """

    # define action value table
    av_table = ActionValueTable(MAX_VAL, MIN_VAL)
    av_table.initialize(0.)

    # define Q-learning agent
    q_learner = Q(Q_ALPHA, Q_GAMMA)
    q_learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(av_table, q_learner)

    # define the environment
    env = BlackjackEnv()

    # define the task
    task = BlackjackTask(env, verbosity=VERBOSE)

    # finally, define experiment
    experiment = Experiment(task, agent)

    # ready to go, start the process
    for _ in range(NB_ITERATION):
        experiment.doInteractions(1)
        if task.lastreward != 0:
            if VERBOSE:
                print "Agent learn"
            agent.learn()

    print '|First State|Choice 0 (Stand)|Choice 1 (Hit)|Relative value of Standing over Hitting|'
    print '|:-------:|:-------|:-----|:-----|'
    for i in range(MAX_VAL):
        print '| %s | %s | %s | %s |' % (
            (i + 1),
            av_table.getActionValues(i)[0], av_table.getActionValues(i)[1],
            av_table.getActionValues(i)[0] - av_table.getActionValues(i)[1])
Example #21
def explore_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = [
        list("!!!!!!!!!!"),
        list("! !  ! ! !"),
        list("! !! ! ! !"),
        list("!    !   !"),
        list("! !!!!!! !"),
        list("! ! !    !"),
        list("! ! !!!! !"),
        list("!        !"),
        list("! !!!!!  !"),
        list("!   !    !"),
        list("!!!!!!!!!!"),
    ]
    structure = np.array([[ord(c) - ord(" ") for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.0)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list("NESW"))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(" #"))[structure])
    print("Maze map:")
    print("\n".join("".join(row) for row in maze))
    print("Greedy policy:")
    print("\n".join("".join(row) for row in greedy_policy))
    assert "\n".join("".join(row) for row in greedy_policy) == "NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN"
Example #22
def main():
    client_id = Utils.connectToVREP()

    # Define RL elements
    environment = StandingUpEnvironment(client_id)
    task = StandingUpTask(environment)
    controller = MyActionValueTable()
    learner = Q(0.5, 0.9)
    learner.explorer = EpsilonGreedyExplorer(0.15, 1)  # EpsilonGreedyBoltzmannExplorer()
    agent = LearningAgent(controller, learner)
    experiment = EpisodicExperiment(task, agent)

    controller.initialize(agent)

    i = 0
    try:
        while True:
            i += 1
            print('Episode ' + str(i))
            experiment.doEpisodes()
            agent.learn()
            agent.reset()
            print('mean: '+str(numpy.mean(controller.params)))
            print('max: '+str(numpy.max(controller.params)))
            print('min: '+str(numpy.min(controller.params)))

            if i % 500 == 0:  # Save q-table every 500 episodes
                print('Save q-table')
                controller.save()
                task.t_table.save()

    except (KeyboardInterrupt, SystemExit):
        with open('../data/standing-up-q.pkl', 'wb') as handle:
            pickle.dump(controller.params, handle)
        task.t_table.save()
        controller.save()

    vrep.simxFinish(client_id)
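The interrupt handler pickles controller.params before shutting down the simulator; reading the snapshot back is symmetric. A minimal sketch (the path comes from the example; treating the contents as the flat Q-table parameter vector is an assumption):

import pickle

with open('../data/standing-up-q.pkl', 'rb') as handle:
    q_params = pickle.load(handle)  # flat parameter vector saved above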
Example #23
class PlayYourCardsRight(Feature):
  
    def __init__(self, text_to_speech, speech_to_text):
        Feature.__init__(self)

        # setup AV Table
        self.av_table = GameTable(13, 2)
        if(self.av_table.loadParameters() == False):
            self.av_table.initialize(0.)
 
        # setup a Q-Learning agent
        learner = Q(0.5, 0.0)
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)
 
        # setup game interaction
        self.game_interaction = GameInteraction(text_to_speech, speech_to_text)

        # setup environment
        environment = GameEnvironment(self.game_interaction)
 
        # setup task
        task = GameTask(environment, self.game_interaction)
 
        # setup experiment
        self.experiment = Experiment(task, self.agent)
    
    @property
    def is_speaking(self):
        return self.game_interaction.is_speaking

    def _thread(self, args):
        # let's play our cards right!
        while not self.is_stop:
            self.experiment.doInteractions(1)
            self.agent.learn()
            self.av_table.saveParameters()
Example #24
class PlayYourCardsRight(Feature):
    def __init__(self, text_to_speech, speech_to_text):
        Feature.__init__(self)

        # setup AV Table
        self.av_table = GameTable(13, 2)
        if (self.av_table.loadParameters() == False):
            self.av_table.initialize(0.)

        # setup a Q-Learning agent
        learner = Q(0.5, 0.0)
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)

        # setup game interaction
        self.game_interaction = GameInteraction(text_to_speech, speech_to_text)

        # setup environment
        environment = GameEnvironment(self.game_interaction)

        # setup task
        task = GameTask(environment, self.game_interaction)

        # setup experiment
        self.experiment = Experiment(task, self.agent)

    @property
    def is_speaking(self):
        return self.game_interaction.is_speaking

    def _thread(self, args):
        # let's play our cards right!
        while not self.is_stop:
            self.experiment.doInteractions(1)
            self.agent.learn()
            self.av_table.saveParameters()
Example #25
def roundrobin(case, learners, profile, m, nb, ns, mx, weeks, days,
               outdir="/tmp", dc=True, trial=0):
    np = len(profile)

    adj = "dc" if dc else "ac"
    market = SmartMarket(case, priceCap=100.0, decommit=True,
                         locationalAdjustment=adj)

    for i, perms in enumerate(itertools.permutations(learners)):
        experiment = MarketExperiment([], [], market, profile)

        for j, learner in enumerate(perms):
            gens = case.generators[j:j + 1]

            if isinstance(learner, ValueBasedLearner):
                # Comment out for stateful Roth-Erev learner.
                nstates = 1 if isinstance(learner, RothErev) else ns

                env = discrete.MarketEnvironment(gens, market,
                                                 markups=m,
                                                 numStates=nstates,
                                                 numOffbids=nb)
                task = discrete.ProfitTask(env, maxSteps=np)

                na = len(env._allActions)
                module = ActionValueTable(numStates=nstates, numActions=na)

            elif isinstance(learner, DirectSearchLearner):
                env = continuous.MarketEnvironment(gens, market, nb, mx)

                task = continuous.ProfitTask(env, maxSteps=np)

                module = buildNetwork(env.outdim, 2, env.indim,
                                      bias=True, outputbias=True,
                                      hiddenclass=TanhLayer,outclass=TanhLayer)
            else:
                raise ValueError

            agent = LearningAgent(module, learner)
            experiment.tasks.append(task)
            experiment.agents.append(agent)

        all_states = zeros((3, 0))
        all_actions = zeros((3, 0))
        all_rewards = zeros((3, 0))
        comments = ["Trial: %d, Perm: %d" % (trial, i)]
        for task, agent in zip(experiment.tasks, experiment.agents):
            g = task.env.generators[0]
            l = agent.learner.__class__.__name__
            comments.append("(%s, %s)" % (g.name, l))
        c = ", ".join(comments)

        for _ in range(weeks):
            experiment.doEpisodes(days)

            states = zeros((0, days * np))
            actions = zeros((0, days * np))
            rewards = zeros((0, days * np))
            for _, agent in enumerate(experiment.agents):
                states = r_[states, agent.history["state"].T]
                actions = r_[actions, agent.history["action"].T]
                rewards = r_[rewards, agent.history["reward"].T]

                agent.learn()
                agent.reset()

            all_states = c_[all_states, states]
            all_actions = c_[all_actions, actions]
            all_rewards = c_[all_rewards, rewards]

        mmwrite(join(outdir, "state_%d_%d.mtx" % (trial, i)), all_states, c)
        mmwrite(join(outdir, "action_%d_%d.mtx" % (trial, i)), all_actions, c)
        mmwrite(join(outdir, "reward_%d_%d.mtx" % (trial, i)), all_rewards, c)
Example #26
performance = []

sv_fig = plt.figure()
pf_fig = plt.figure()

# experiment.doEpisodes(50)
    
while(True):
    env.delay = True
    experiment.doEpisodes(10)
    env.delay = False

    while agent.history.getNumSequences() > 50:
        agent.history.removeSequence(0)
        
    agent.learn(20)
    
    experiment.agent = testagent
    r = mean([sum(x) for x in experiment.doEpisodes(20)])

    testagent.reset()
    experiment.agent = agent
    

    performance.append(r) 
    plotStateValues(module, sv_fig)
    plotPerformance(performance, pf_fig)
    print "reward avg", r
    print "params", agent.module.network.params
    # print "exploration", agent.learner.explorer.epsilon
    print "num samples", agent.history.getNumSequences()
Example #27
class SimulationMaster:

    def __init__(self, n_threads=4, initial_port=19997, q_table_version=0,
                 batch_size=None, learner=None, explorer=None):
        self.barrier = Barrier(n_threads + 1, timeout=720)
        self.n_threads = n_threads
        self.initial_port = initial_port
        self.batch_size = batch_size

        self.controller = MyActionValueTable(q_table_version)
        if learner is None:
            self.learner = Q(0.5, 0.9)
        else:
            self.learner = learner

        if explorer is None:
            self.explorer = self.learner.explorer = EpsilonGreedyExplorer(0.2, 0.998)
        else:
            self.explorer = self.learner.explorer = explorer
        self.agent = LearningAgent(self.controller, self.learner)
        # Logger initialization
        self.logger = logging.getLogger('master_logger')
        self.logger.setLevel(logging.DEBUG)
        self.logger.addHandler(logging.FileHandler(Utils.DATA_PATH + 'learning-tables/master.log'))
        self.failed_simulations = []
        self.n_episodes = 0
        self.simulations = []
        self.initialize_simulations()

    def initialize_simulations(self):
        self.simulations = []
        for i in range(self.n_threads):
            if self.batch_size is not None:
                self.simulations.append(Simulation(self, self.initial_port + i, self.batch_size))
            else:
                self.simulations.append(Simulation(self, self.initial_port + i))

    def get_action(self, observation):
        action = self.controller.activate(observation)
        action = self.explorer.activate(observation, action)
        return action

    def add_observation(self, obs):
        """
            Adds observation in the agent memory
            :param obs: 3 dimensional vector containing [observation, action, reward]
        """
        self.agent.integrateObservation(obs[0])
        self.agent.lastaction = obs[1]
        self.agent.giveReward(obs[2])

    def update_q_table(self):
        """
            Updates the q table with the new simulators observations
        """
        for sim in self.simulations:
            for trace in sim.traces:
                for obs in trace:
                    self.add_observation(obs)
                self.agent.learn()
                self.agent.reset()
                self.n_episodes += 1

            sim.traces.clear()
        if self.explorer.epsilon > 0.1:
            self.explorer.epsilon *= self.explorer.decay
        if self.learner.alpha > 0.1:
            self.learner.alpha *= 0.999
        self.logger.info('new epsilon: {}'.format(self.explorer.epsilon))
        self.logger.info('new alpha: {}'.format(self.learner.alpha))
        self.logger.info('n episodes: {}'.format(self.n_episodes))

    def save_t_table(self):
        """
            Saves t tables, one for each thread
        """
        for sim in self.simulations:
            sim.save_t_table()

    def run(self):

        self.controller.initialize(self.agent)
        for sim in self.simulations:
            sim.start()
        counter = 0
        while True:
            try:
                self.barrier.wait()  # wait until all simulations are done
                self.update_q_table()
                self.save_t_table()
                self.barrier.wait()  # Free simulations threads and start a new cycle
                # Counter to avoid to save q-table too often
                if counter == 5:
                    self.controller.save()
                    counter = 0
                else:
                    counter += 1
                while self.failed_simulations:
                    sim = self.failed_simulations.pop()
                    self.restart_simulation(sim)
            except BrokenBarrierError as e:
                self.logger.error('Broken Barrier Error Occurred')
                for sim in self.simulations:
                    sim.stop()
                for sim in self.simulations:
                    sim.join()
                del self.simulations
                self.initialize_simulations()
                self.barrier.reset()
                self.failed_simulations.clear()
                for sim in self.simulations:
                    sim.start()

    def restart_simulation(self, simulation):
        self.logger.info('Restarting simulation with port {}'.format(simulation.port))
        self.simulations.remove(simulation)
        new_simulation = Simulation(self, simulation.port)
        self.simulations.append(new_simulation)
        new_simulation.start()
        del simulation
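update_q_table() multiplies epsilon by its decay factor once per cycle until it drops to 0.1, so with the defaults above (0.2, 0.998) the number of decaying cycles follows from 0.2 * 0.998**n = 0.1. A quick check of that arithmetic:

import math

epsilon_0, decay, floor = 0.2, 0.998, 0.1
n_cycles = math.log(floor / epsilon_0) / math.log(decay)
print(int(math.ceil(n_cycles)))  # ~347 cycles before epsilon stops decaying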
Example #28
def rl_optimizer(UImatrix, actionmatrix, actions_in_uis, actions_penalty,
                 num_of_actions, params):
    policies = [([])] * params.num_states

    # Defining UI environment
    goal = 1  # Default goal
    ui_env = UI(UImatrix, actionmatrix, actions_in_uis, actions_penalty, goal,
                params)
    av_table = ActionValueTable(params.num_states, num_of_actions)

    klm_tot = 0
    klm_avg = 0
    p_learned = 1
    modality_table_total = np.array([0, 0, 0])
    klm_total = 0

    # Train agent for each goal
    for g in range(0, ui_env.num_of_states):

        ##############################################
        # Define Q-learning agent
        learner = Q(0.5, 0.9)
        learner.explorer.epsilon = 0.7
        learner.explorer.decay = 0.999
        learner.explorer.env = ui_env
        agent = LearningAgent(av_table, learner)

        # Define task and experiment
        task = UITask(ui_env)
        experiment = EpisodicExperiment(task, agent)

        # Initialize the AV table; removes disallowed actions.
        av_table.initialize(1., actionmatrix)

        # Set goal
        experiment.task.env.setGoal(g)

        for j in range(50):

            initial_state = mod(j, ui_env.num_of_states)
            if initial_state == g: continue
            experiment.task.env.setInitialState(initial_state)

            runs = 50
            experiment.doEpisodes(runs)

            agent.learn()
            agent.reset()

        ##############################################
        # Evaluation of UI and policy for the current goal
        # Iterate to get average - use only if errors used
        total_iterations = 10
        klm_tot = 0
        for i in range(total_iterations):
            # KLM value
            klm_g, modality_table = evaluation(av_table, ui_env, g, params)

            # Not learned
            if klm_g == -1:
                klm_tot += 20 * 5
                p_learned = 0
                return -1, 0, 0, 0, 0

            # Save to total KLM
            klm_tot += klm_g / (params.num_states - 1)

        klm_avg += klm_tot / total_iterations

        modality_table_total += np.array(modality_table)
        klm_total += klm_avg

        if p_learned == 0: break

    return modality_table_total, klm_total
Example #29
def Py_Brain():
    ############################
    # pybrain
    ############################
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap
    import itertools
    from scipy import linalg

    from pybrain.rl.environments.mazes import Maze, MDPMazeTask
    from pybrain.rl.learners.valuebased import ActionValueTable
    from pybrain.rl.agents import LearningAgent
    from pybrain.rl.learners import Q, SARSA
    from pybrain.rl.experiments import Experiment
    from pybrain.rl.environments import Task

    import pylab
    #pylab.gray()
    #pylab.ion()

    '''
    structure = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    '''
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 1, 0, 0, 1],
                          [1, 1, 0, 1, 1],
                          [1, 0, 0, 1, 1],
                          [1, 1, 1, 1, 1]])

    num_states = int(structure.shape[0]*structure.shape[1])
    SQRT = int(math.sqrt(num_states))
    #print structure.item((1, 3))
    #environment = Maze(structure, (7, 7)) #second parameter is goal field tuple
    environment = Maze(structure, (1, 3)) #second parameter is goal field tuple
    print type(environment)
    print environment
    # Standard maze environment comes with the following 4 actions:
    # North, South, East, West
    controller = ActionValueTable(num_states, 4) #[N, S, E, W] 
    controller.initialize(1)

    learner = Q()
    agent = LearningAgent(controller, learner)
    np.not_equal(agent.lastobs, None)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    #while True:
    for x in range(4):
        print x
        experiment.doInteractions(10)
        agent.learn()
        agent.reset()

        pylab.pcolor(controller.params.reshape(num_states,4).max(1).reshape(SQRT,SQRT))
        pylab.draw()
        #pylab.show()
        name='MAZE'
        plt.savefig(str(name)+'_PLOT.png')
    plt.close()
Example #30
def rl_optimizer(num_of_actions, params):

    # Defining UI environment
    actionmatrix = range(num_of_actions)
    goal = False # Default goal
    ui_env = UI(params.num_states, actionmatrix, num_of_actions, goal, params.sensor_errors, params.confusion_error, params.penalties, params.grid, params.dimensions, params.init_position, params.goal_position)
    av_table = ActionValueTable(params.num_states, num_of_actions, ui_env)
    av_table.initialize(1)

    # Train agent for each goal
    klm_tot = 0
    klm_avg = 0
    p_learned = 1


    ##############################################
    # Define Q-learning agent
    learner = Q(0.5, 0.99) 
    learner.explorer.epsilon = 0.7 
    learner.explorer.decay = 0.999
    learner.explorer.env = ui_env
    agent = LearningAgent(av_table, learner)

    # Initialize the AV table; pass the action matrix as input.
    av_table.initialize(-5., actionmatrix) 

    # Define task and experiment
    task = UITask(ui_env)
    experiment = EpisodicExperiment(task, agent, av_table)

    ##############################################
    # Training Agent
    for j in range(8): # Learning iterations

        runs = 50 # Episodes in one iteration
        experiment.doEpisodes(runs) 
        
        agent.learn()
        agent.reset()


    ##############################################
    # Evaluation of UI and policy for current goal
    # Loop to get average : use only if errors used
    klm_tasks_tot = np.array([0.]*(params.num_states-1))
    total_iterations = 1
    klm_tot = 0
    for i in range(total_iterations):
        # KLM value
        klm_g, best_path = evaluation(av_table, ui_env, task, False, params)
        if klm_g == -1: # Not learned
            klm_tot += 20*5
            p_learned = 0
            print "Policy not learned"
            break
        # Save to total KLM
        klm_tot += klm_g
    # Average KLM estimate
    klm_avg += klm_tot/total_iterations

    return best_path, klm_avg
Example #31
    plt.figure(fig.number)
    plt.clf()
    plt.plot(values, 'o-')
    plt.gcf().canvas.draw()
    # Without the next line, the pyplot plot won't actually show up.
    plt.pause(0.001)

performance = []

if not render:
    pf_fig = plt.figure()

while True:
    # one learning step after one episode of world-interaction
    experiment.doEpisodes(1)
    agent.learn(1)

    # test performance (these real-world experiences are not used for training)
    if render:
        env.delay = True
    experiment.agent = testagent
    r = mean([sum(x) for x in experiment.doEpisodes(5)])
    env.delay = False
    testagent.reset()
    experiment.agent = agent

    performance.append(r)
    if not render:
        plotPerformance(performance, pf_fig)

    print("reward avg", r)
Example #32
from pybrain.rl.experiments import Experiment

envmatrix = array([[1,1,1,1,1,1,1,1,1],
                   [1,0,0,1,0,0,0,0,1],
                   [1,0,0,1,0,0,1,0,1],
                   [1,0,0,1,0,0,1,0,1],
                   [1,0,0,1,0,1,1,0,1],
                   [1,0,0,0,0,0,1,0,1],
                   [1,1,1,1,1,1,1,0,1],
                   [1,0,0,0,0,0,0,0,1],
                   [1,1,1,1,1,1,1,1,1]])
environment = Maze(envmatrix, (7,7))
task = MDPMazeTask(environment)

table = ActionValueTable(81,4)
table.initialize(1.)

agent = LearningAgent(table,Q())

experiment = Experiment(task,agent)

plt.ion()
plt.gray()

for i in range(1000):
    experiment.doInteractions(100)
    agent.learn()
    agent.reset()
    plt.pcolor(table.params.reshape(81, 4).max(axis=1).reshape(9, 9))
    plt.gcf().canvas.draw()
Example #33
  mimicTable.initialize(0.)

  mimicLearner = Q(ALPHA, GAMMA)
  mimicLearner._setExplorer(EpsilonGreedyExplorer(EPSILON))
  mimicAgent = LearningAgent(mimicTable, mimicLearner)

  mimicEnv = MimicryPreyEnvironment(world)
  mimicTask = MimicryPreyTask(mimicEnv)
  mimicExp = Experiment(mimicTask, mimicAgent)

  try:
    for t in xrange(MAX_TIME):
      print 't = %d' % t 
      world.t = t
      predExp.doInteractions(1)
      predAgent.learn()
      mimicExp.doInteractions(1)
      mimicAgent.learn()
      print 'Mimicker Colors vs. Q-table:'
      table_print(mimicTable._params, MimicryPreyInteraction.NSTATES)
      print 'Predator Colors vs. Q-table:'
      table_print(predTable._params, PredatorInteraction.NSTATES)
      print

  except KeyboardInterrupt:
    pass

  finally:
    print 'Background: %s' % BKGD_COLOR
    print 'Predator Colors vs. Final Q-table:'
    table_print(predTable._params, PredatorInteraction.NSTATES)
Example #34
# define action-value table
# number of states is:
#
#    current value: 1-21
#
# number of actions:
#
#    Stand=0, Hit=1
av_table = ActionValueTable(21, 2)
av_table.initialize(0.)

# define Q-learning agent
learner = Q(0.5, 0.0)
learner._setExplorer(EpsilonGreedyExplorer(0.0))
agent = LearningAgent(av_table, learner)

# define the environment
env = BlackjackEnv()

# define the task
task = BlackjackTask(env)

# finally, define experiment
experiment = Experiment(task, agent)

# ready to go, start the process
while True:
    experiment.doInteractions(1)
    agent.learn()
    agent.reset()
Example #35
  predTable.initialize(0.)

  predLearner = Q(ALPHA, GAMMA)
  predLearner._setExplorer(EpsilonGreedyExplorer(EPSILON))
  predAgent = LearningAgent(predTable, predLearner)

  predEnv = PredatorEnvironment(world)
  predTask = PredatorTask(predEnv)
  predExp = Experiment(predTask, predAgent)

  try:
    for t in xrange(MAX_TIME):
      print 't = %d' % t 
      world.t = t
      predExp.doInteractions(1)
      predAgent.learn()
      print 'Colors vs. Q-table:'
      table_print(predTable._params, PredatorInteraction.NSTATES)
      print

  except KeyboardInterrupt:
    pass

  finally:
    print 'Background: %s' % BKGD_COLOR
    print 'Colors vs. Final Q-table:'
    table_print(predTable._params, PredatorInteraction.NSTATES)
    print

    counts = {'ate' : {}, 'poison' : 0, 'death' : 0, 'poisondeath' : 0, 'rejected' : {}}
Example #36
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    
    seed = parameters["seed"]
   

    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)


    
    
    render = False    
    plot = False
    
    plt.ion()
    
    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()
    
    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"])

    #print "dim: ", task.indim, task.outdim
    
    # inputs: state dimension (task.outdim); outputs: one value per action (task.indim)
    module = ActionValueNetwork(task.outdim, task.indim)
    

    learner = NFQ()
    # % of random actions
    learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    
    
    agent = LearningAgent(module, learner)
    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    
    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)
    
    performance = []
    
    if plot:
        pf_fig = plt.figure()
    
    m = parameters["MaxTotalEpisodes"]/parameters["EpisodesPerLearn"]
    for episode in range(0, m):
        # one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])
        agent.learn(1)
    
        #renderer.drawPlot()
        
        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True
        
        if (episode) % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            
            #experiment.agent = testagent
            r = mean([sum(x) for x in testexperiment.doEpisodes(parameters["TestWith"])])
            
            env.delay = False
            testagent.reset()
            #experiment.agent = agent
        
            performance.append(r)
            if plot:
                plotPerformance(performance, pf_fig)
        
#            print "reward avg", r
#            print "explorer epsilon", learner.explorer.epsilon
#            print "num episodes", agent.history.getNumSequences()
#            print "update step", len(performance)
            
#    print "done"
    return performance
            
        #print "network",   json.dumps(module.bn.net.E, indent=2)
Example #37
weeks = 52 * 2
days = 5  # number of samples per gradient estimate
for week in range(weeks):
    all_rewards = experiment.doEpisodes(number=days)
    tot_reward = numpy.mean(agent.history.getSumOverSequences('reward'))

    #    print learner._allEvaluations#[-:-1]

    # Plot the reward at each period averaged over the week.
    r = -1.0 * numpy.array(all_rewards).reshape(days, nf)
    avg_r = numpy.mean(r, 0)
    plot.setData(5, rday, avg_r)

    # Plot the set-point of each generator on the last day of the week.
    # FIXME: Plot the set-points averaged over the week.
    for i in range(len(case.online_generators)):
        scale_factor = 10
        #        plot.setData(i, rday, env._Pg[i, :] * scale_factor)
        plot.setData(i, rday, experiment.Pg[i, :] * scale_factor)

    agent.learn()
    agent.reset()

    # Scale sigma manually.
    sigma = [(sig * 0.95) - 0.05 for sig in sigma]
    learner.explorer.sigma = sigma

    plot.update()

pylab.savefig("/tmp/rlopf.png")
Example #38
    plt.clf()
    plt.plot(values, 'o-')
    plt.gcf().canvas.draw()
    # Without the next line, the pyplot plot won't actually show up.
    plt.pause(0.001)


performance = []

if not render:
    pf_fig = plt.figure()

while (True):
    # one learning step after one episode of world-interaction
    experiment.doEpisodes(1)
    agent.learn(1)

    # test performance (these real-world experiences are not used for training)
    if render:
        env.delay = True
    experiment.agent = testagent
    r = mean([sum(x) for x in experiment.doEpisodes(5)])
    env.delay = False
    testagent.reset()
    experiment.agent = agent

    performance.append(r)
    if not render:
        plotPerformance(performance, pf_fig)

    print("reward avg", r)
Example #39
def rl_optimizer(UImatrix, buttonmatrix, num_of_actions, top_UI, params,
                 batch_num, logging):

    # Defining UI environment
    goal = 1  # Default goal
    ui_env = UI(UImatrix, buttonmatrix, goal, params.sensor_errors,
                params.confusion_error, params.penalties)
    av_table = ActionValueTable(params.num_states, num_of_actions)

    # Check which actions are allowed
    actions_index = [[]] * params.num_states
    bad_actions_idx = []

    for j in range(0, params.num_states):
        for i in range(0, params.num_states):
            if i == j: continue
            if UImatrix[j][i] == 1:
                actions_index[j] = actions_index[j] + [1, 1, 1]
                bad_actions_idx = bad_actions_idx + [0, 0, 0]
            else:
                actions_index[j] = actions_index[j] + [0, 0, 0]
                bad_actions_idx = bad_actions_idx + [1, 1, 1]

    ui_env.actions_index = actions_index
    bad_actions = np.nonzero(np.array(bad_actions_idx))

    # Train agent for each goal
    klm_tot = 0
    klm_avg = 0
    policies = []
    objective = -1
    p_learned = 1
    ii = 0
    for g in range(0, ui_env.num_of_states):

        ##########################
        # Define Q-learning agent
        ##########################
        learner = Q(0.5, 0.9)
        learner.explorer.epsilon = 0.3
        learner.explorer.decay = 0.999
        learner.explorer.actions_index = actions_index
        learner.explorer.env = ui_env
        agent = LearningAgent(av_table, learner)

        ##########################
        # Define task and experiment
        ##########################
        task = UITask(ui_env)
        experiment = EpisodicExperiment(task, agent)

        # Set low values for not allowed actions
        bad_actions = np.ones([ui_env.num_of_states, ui_env.num_of_states])
        for idx_state in range(ui_env.num_of_states):
            for idx_button in range(len(buttonmatrix[idx_state])):
                bad_actions[idx_state, buttonmatrix[idx_state]] = 0
        bad_actions = np.reshape(bad_actions,
                                 ui_env.num_of_states * ui_env.num_of_states)
        bad_actions = np.nonzero(bad_actions)

        av_table.initialize(1., bad_actions[0])  #Removed bad actions

        # Initialize saved N av_tables
        convergence_N = 1  # Move to params
        av_tables_save = [] * convergence_N

        # Set goal
        experiment.task.env.setGoal(g)

        ##########################
        # Training the agent
        ##########################
        # Add more iterations and runs if not learning.
        for j in range(10):

            initial_state = mod(j, ui_env.num_of_states)
            if initial_state == g: continue
            experiment.task.env.setInitialState(initial_state)

            runs = 15
            experiment.doEpisodes(runs)

            agent.learn()
            agent.reset()

        ##########################
        # Save policy
        ##########################
        p = list(av_table.params)  # Copies to new memory slot
        policies.append(p)

        ##############################################
        # Evaluation of UI and policy for current goal
        ##############################################
        # Loop to get average : use only if errors used
        klm_tasks_tot = np.array([0.] * (params.num_states - 1))
        total_iterations = 15
        klm_tot = 0
        for i in range(total_iterations):
            # KLM value
            klm_g = evaluation(av_table, ui_env, g, params, batch_num, logging)
            if klm_g == -1:  # Not learned
                klm_tot += 20 * 5
                p_learned = 0
                break
            # Save to total KLM
            klm_tot += klm_g / (params.num_states - 1)
        klm_avg += params.state_probs[g] * klm_tot / total_iterations

        if p_learned == 0: break

    if p_learned == 1:  # Policy learned
        ##########################
        # Consistency
        ##########################
        consistency = 0
        idx = 0
        transitions_state = np.sum(UImatrix, 0)
        buttons_to_states = np.zeros(
            [params.num_states,
             params.num_states])  # Which buttons reach to the state goal
        for sr in range(params.num_states):
            for sc in range(params.num_states):
                if UImatrix[sr, sc] == 1:
                    buttons_to_states[sc][buttonmatrix[sr][idx]] += 1
                    idx = idx + 1
            idx = 0
        for s in range(params.num_states):
            for act in range(params.num_states):
                if buttons_to_states[s][act] > 0:
                    consistency += math.log(buttons_to_states[s][act] /
                                            transitions_state[s])

        ##########################
        # Objective function
        ##########################
        objective = params.w_klm * klm_avg
        objective = objective - 1 * params.w_const * consistency
        objective = objective + params.w_simpl * math.log(
            np.sum(np.sum(UImatrix, 1)))
        objective_func = [
            klm_avg, consistency,
            math.log(np.sum(np.sum(UImatrix, 1)))
        ]

        ##########################
        # Save the best
        ##########################
        top_UI.append([
            UImatrix, buttonmatrix, policies, objective, objective_func,
            klm_avg
        ])
        if len(top_UI) > params.top:
            top_UI = sorted(top_UI, key=op.itemgetter(3))[:params.top]

    return top_UI, objective
Example #40
def rl_optimizer(UImatrix, num_of_actions, top_UI, params, batch_num, logging):

    #global policies
    #global top_UI
    policies = [([])] * params.num_states
    num_states = params.num_states

    # Defining UI environment
    actionmatrix = range(num_of_actions)
    goal = False  # Default goal
    ui_env = UI(num_states, actionmatrix, num_of_actions, goal,
                params.sensor_errors, params.confusion_error, params.penalties,
                params.grid, params.dimensions, params.init_position,
                params.goal_position)
    av_table = ActionValueTable(num_states, num_of_actions, ui_env)
    av_table.initialize(1)

    # Train agent for each goal
    klm_tot = 0
    klm_avg = 0
    policies = []
    best_actions = []  # [0]*ui_env.num_of_states*(ui_env.num_of_states-1)
    objective = -1
    p_learned = 1
    ii = 0

    #########
    # Define Q-learning agent
    learner = Q(0.5, 0.99)  #Q(0.6, 0.99) # 0.5, 0.99
    learner.explorer.epsilon = 0.7  # 0.7 # 0.9
    learner.explorer.decay = 0.999  # 0.99
    learner.explorer.env = ui_env
    agent = LearningAgent(av_table, learner)

    # Define task and experiment
    task = UITask(ui_env)
    experiment = EpisodicExperiment(task, agent, av_table)
    #######

    # Remove disallowed actions; pass the action matrix as input
    av_table.initialize(-5., actionmatrix)

    for j in range(8):  # Learning iterations

        initial_state = 0

        runs = 50  # Episodes in one iteration
        experiment.doEpisodes(runs)

        agent.learn()
        agent.reset()

        ##############################################
        # Save policy
        # For optimization
        p = list(av_table.params)  # Copies to new memory slot
        policies.append(p)

    ##############################################
    # Evaluation of UI and policy for current goal
    # Loop to get average : use only if errors used

    klm_tasks_tot = np.array([0.] * (params.num_states - 1))
    total_iterations = 1
    klm_tot = 0
    for i in range(total_iterations):
        # KLM value
        klm_g, best_path = evaluation(av_table, ui_env, task, False, params,
                                      batch_num, logging)
        if klm_g == -1:  # Not learned
            klm_tot += 20 * 5
            p_learned = 0
            print "error"
            break
        # Save to total KLM
        klm_tot += klm_g
    klm_avg += klm_tot / total_iterations

    return top_UI, objective, best_actions, best_path, klm_g