Example #1
def test_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 0, 0, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1]])
    shape = np.array(structure.shape)
    environment = Maze(structure,  tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order 
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4),1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(''.join(row) for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
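A note on the policy-extraction step above: all Q-values sit in a flat (num_states x num_actions) array, and the greedy action per state is just the row-wise argmax. A minimal, self-contained numpy sketch of that step (the 2x4 Q-values are made up for illustration; 'NESW' is the action order used above):

import numpy as np

q = np.array([[1.0, 0.2, 0.0, 0.1],   # state 0: North has the largest value
              [0.1, 0.9, 0.3, 0.0]])  # state 1: East has the largest value
greedy_policy = np.array(list('NESW'))[q.argmax(axis=1)]
print(greedy_policy)  # ['N' 'E']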
class Team(object):
    def __init__(self, living, task, learner = ENAC()):
        self.living = living
        self.task = task
        self.last_reward = 0
        self.agent = LearningAgent(self.living.brain, learner)
        self.oldparams = self.living.brain.params
    def Interaction(self):
        self.agent.integrateObservation(self.task.getObservation())
        self.task.performAction(self.agent.getAction())
        self.last_reward = self.task.getReward()
        self.agent.giveReward(self.last_reward)
        
        finished = self.task.isFinished()
        if finished:
            #print task.cumreward
            self.agent.newEpisode()
            self.task.reset()
        return self.last_reward, finished
    
    def Learn(self, episodes = 1):    
        self.agent.learn(episodes)
        self.agent.reset()
                        
        newparams = self.living.brain.params.copy() #get_all_weights(eater.brain)[:]
        dif = 0
        j = 0
        for i in newparams:
            dif += (self.oldparams[j] - newparams[j])**2
            j += 1
        self.oldparams = newparams
        return dif
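The loop at the end of Learn computes the squared Euclidean distance between the old and new parameter vectors of the brain network. A standalone numpy equivalent (the parameter vectors here are made up):

import numpy as np

oldparams = np.array([0.1, 0.5, -0.2])
newparams = np.array([0.2, 0.4, -0.2])
dif = float(np.sum((oldparams - newparams) ** 2))
print(round(dif, 6))  # 0.02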
def q_learning_table():
    controller = ActionValueTable(36, 4)
    learner = Q()
    controller.initialize(1.)

    agent = LearningAgent(controller, learner)

    score_list = []
    turn_list = []
    # includes an extra +100 to match the training on the neural-network side
    for i in range(600):
        print_state(agent.module.getValue, 'table')

        score, turn = play(agent, 'table')
        score_list.append(score)
        turn_list.append(turn)

        agent.learn()
        agent.reset()

        print i, int(numpy.mean(score_list)), max(score_list), score, turn

        with open('./agent.dump', 'w') as f:
            pickle.dump(agent, f)
        with open('./score.dump', 'w') as f:
            pickle.dump([score_list, turn_list], f)
Example #4
def train():

    # Make the environment
    environment = TwentyFortyEightEnvironment()

    # The task is the game this time
    task = environment

    # Build the value network for the agent (a network is used because the inputs are continuous)
    network = ActionValueNetwork(task.nSenses, task.nActions)

    # Use NFQ, the neural-network counterpart of table-based Q-learning
    learner = NFQ()
    learner.gamma = GAMMA

    agent = LearningAgent(network, learner)

    # Set up an experiment
    experiment = EpisodicExperiment(task, agent)

    # Train the Learner
    meanScores = []
    for i in xrange(LEARNING_EPOCHS):
        experiment.doEpisodes(GAMES_PER_EPOCH)
        print "Iteration ", i, " With mean score ", task.meanScore, "Max block achieved ", environment.maxGameBlock
        meanScores.append(task.meanScore)
        agent.learn()
        agent.reset()

    params = {"learningEpochs": LEARNING_EPOCHS, "gamesPerEpoch": GAMES_PER_EPOCH, "gamma": GAMMA }
    return meanScores, params, agent
def main():
    # if os.path.exists('./agent.dump'):
    #     with open('./agent.dump') as f:
    #         agent = pickle.load(f)
    # else:
    controller = ActionValueNetwork(9, 4)
    learner = NFQ()
    agent = LearningAgent(controller, learner)

    score_list = []
    for i in range(10000):

        score = play(agent)
        score_list.append(score)

        # At this point the following error occurred:
        #   TypeError: only length-1 arrays can be converted to Python scalars
        #   raised from pybrain/rl/learners/valuebased/q.py
        #   => switching the learner from Q to NFQ fixed it.
        #   => http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work

        #agent.learn()
        agent.reset()

        #data =[[0,0,0,0], [0,0,0,0], [0,0,0,2], [0,0,0,2]]
        data =[[0,0,2], [0,0,0], [0,0,2]]
        agent.integrateObservation(numpy.array(data).ravel())
        move = agent.getAction()
        print i, int(numpy.mean(score_list)) , max(score_list), move

        with open('./agent.dump', 'w') as f:
            pickle.dump(agent, f)
        with open('./score.dump', 'w') as f:
            pickle.dump(score_list, f)
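The comment block above records that pairing the Q learner with an ActionValueNetwork raised a TypeError and that switching to NFQ avoided it. A minimal sketch of the two conventional pairings used throughout these examples (assumes PyBrain is installed; the state and action counts are arbitrary):

from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q, NFQ
from pybrain.rl.learners.valuebased import ActionValueTable, ActionValueNetwork

table_agent = LearningAgent(ActionValueTable(9, 4), Q())        # discrete state index + table learner
network_agent = LearningAgent(ActionValueNetwork(9, 4), NFQ())  # continuous observation + network learner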
def q_learning_table():
    controller = ActionValueTable(36, 4)
    learner = Q()
    controller.initialize(1.)

    agent = LearningAgent(controller, learner)

    score_list = []
    turn_list  = []
    # includes an extra +100 to match the training on the neural-network side
    for i in range(600):
        print_state(agent.module.getValue, 'table')

        score, turn = play(agent, 'table')
        score_list.append(score)
        turn_list.append(turn)

        agent.learn()
        agent.reset()

        print i, int(numpy.mean(score_list)) , max(score_list), score, turn

        with open('./agent.dump', 'w') as f:
            pickle.dump(agent, f)
        with open('./score.dump', 'w') as f:
            pickle.dump([score_list, turn_list], f)
Example #7
class QAlgorithm:
  def Pause(self):  # if the menu says pause, pause execution
    while self.state == 1:
      time.sleep(.05)
    return True

  def Quit(self):#if menu says quit stop running
    self.process.terminate()
    return False

  def Start(self):#starts the Bot
    if self.process == None:
      self.runBot()
      #self.process = multiprocessing.Process(target=self.runBot, args= [])
      #self.process.start() 
    return True

  def CheckState(self):#checks to see what state the menu says to be in 
    if self.state == 0 :
      self.Start()
    elif self.state == 1:
      self.Pause()
    elif self.state == 2:
      self.Quit()

  def GameOver(self):  # checks whether the menu state requires the bot to pause or quit, or whether the game is over
    return self.CheckState() or self.sr.checkEndGame(self.endBox,self.gameOver)

  def __init__(self,rewardBox,box,gameOver,endGame,scoreArea):
    self.reward = rewardBox
    self.bbox = box
    self.environment = TEnviroment(box)#Custom environment class
    if os.path.isfile("bot.txt"):
      self.controller  = pickle.load(open("bot.txt","rb")) 
    else:
      self.controller = ActionValueNetwork(50**2, 4)  # Arguments: (framerate*maxPlaytime, number of actions)
    self.learner = Q()
    gf = {0:self.GameOver}
    self.agent = LearningAgent(self.controller, self.learner)
    self.task = TTask(self.environment,scoreArea,gf)#needs custom task
    self.experiment = EpisodicExperiment(self.task, self.agent)
    self.process = None
    self.endBox = endGame

  def runBot(self):  # runs the bot for a single episode
      self.experiment.doEpisodes()
      self.agent.learn()
      self.agent.reset()
      file = open("bot.txt","wb+")
      pickle.dump(self.controller,file)
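__init__ and runBot above follow the usual load-if-present, then re-save pickle pattern for the controller. A self-contained sketch of that pattern (a plain dict stands in for ActionValueNetwork(50**2, 4) so the snippet runs anywhere; 'demo.pkl' is a made-up file name):

import os
import pickle

path = 'demo.pkl'
if os.path.isfile(path):
    with open(path, 'rb') as f:
        controller = pickle.load(f)
else:
    controller = {'params': [0.0] * 4}  # stand-in for the real controller

# ... train / update the controller here ...

with open(path, 'wb') as f:
    pickle.dump(controller, f)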
Example #8
 def learn(self, number_of_iterations):
     learner = Q(0.2, 0.8)
     task = CartMovingTask(self.environment)
     self.controller = ActionValueTable(
         reduce(lambda x, y: x * y, map(lambda x: len(x), self.ranges)),
         self.force_granularity)
     self.controller.initialize(1.)
     agent = LearningAgent(self.controller, learner)
     experiment = Experiment(task, agent)
     for i in range(number_of_iterations):
         experiment.doInteractions(1)
         agent.learn()
         agent.reset()
     with open("test.pcl", "w+") as f:
         pickle.dump(self.controller, f)
Example #9
 def learn(self, number_of_iterations):
     learner = Q(0.2, 0.8)
     task = CartMovingTask(self.environment)
     self.controller = ActionValueTable(
         reduce(lambda x, y: x * y, map(lambda x: len(x), self.ranges)), self.force_granularity
     )
     self.controller.initialize(1.0)
     agent = LearningAgent(self.controller, learner)
     experiment = Experiment(task, agent)
     for i in range(number_of_iterations):
         experiment.doInteractions(1)
         agent.learn()
         agent.reset()
     with open("test.pcl", "w+") as f:
         pickle.dump(self.controller, f)
def main():

    # Storing every 2048 board state in a table is not feasible:
    #   there are on the order of 14^16 possible states.
    #controller = ActionValueTable(16, 4)
    #learner = Q()
    #controller.initialize(1.)

    controller = ActionValueNetwork(16, 4)
    learner = NFQ()
    #learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(controller, learner)

    score_list = []
    for i in range(10000):
        # if os.path.exists('./agent.dump'):
        #     with open('./agent.dump') as f:
        #         agent = pickle.load(f)

        print i, 'playing ...'
        score = play(agent)
        score_list.append(score)

        # At this point the following error occurred:
        #   TypeError: only length-1 arrays can be converted to Python scalars
        #   raised from pybrain/rl/learners/valuebased/q.py
        #   => switching the learner from Q to NFQ fixed it.
        #   => http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work
        print i, 'learning ...'
        agent.learn()
        agent.reset()

        print i, 'evaluate sample ...'
        data = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 2]]
        agent.integrateObservation(numpy.array(data).ravel())
        move = agent.getAction()
        print "                           ", i, int(
            numpy.mean(score_list)), max(score_list), move

        if i % 20 == 0:
            print i, 'saving ...'
            with open('./agent.dump', 'w') as f:
                pickle.dump(agent, f)
            with open('./score.dump', 'w') as f:
                pickle.dump(score_list, f)
Example #11
    def maze():
        # import sys, time
        pylab.gray()
        pylab.ion()
        # The goal appears to be in the upper right
        structure = [
            '!!!!!!!!!!',
            '! !  ! ! !',
            '! !! ! ! !',
            '!    !   !',
            '! !!!!!! !',
            '! ! !    !',
            '! ! !!!! !',
            '!        !',
            '! !!!!!  !',
            '!   !    !',
            '!!!!!!!!!!',
            ]
        structure = np.array([[ord(c)-ord(' ') for c in row] for row in structure])
        shape = np.array(structure.shape)
        environment = Maze(structure, tuple(shape - 2))
        controller = ActionValueTable(shape.prod(), 4)
        controller.initialize(1.)
        learner = Q()
        agent = LearningAgent(controller, learner)
        task = MDPMazeTask(environment)
        experiment = Experiment(task, agent)

        for i in range(100):
            experiment.doInteractions(100)
            agent.learn()
            agent.reset()
            # 4 actions, 81 locations/states (9x9 grid)
            # max(1) gives/plots the biggest objective function value for that square
            pylab.pcolor(controller.params.reshape(81, 4).max(1).reshape(9, 9))
            pylab.draw()

        # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
        greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
        greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
        maze = np.flipud(np.array(list(' #'))[structure])
        print('Maze map:')
        print('\n'.join(''.join(row) for row in maze))
        print('Greedy policy:')
        print('\n'.join(''.join(row) for row in greedy_policy))
Example #12
    def maze():
        # import sys, time
        pylab.gray()
        pylab.ion()
        # The goal appears to be in the upper right
        structure = [
            "!!!!!!!!!!",
            "! !  ! ! !",
            "! !! ! ! !",
            "!    !   !",
            "! !!!!!! !",
            "! ! !    !",
            "! ! !!!! !",
            "!        !",
            "! !!!!!  !",
            "!   !    !",
            "!!!!!!!!!!",
        ]
        structure = np.array([[ord(c) - ord(" ") for c in row] for row in structure])
        shape = np.array(structure.shape)
        environment = Maze(structure, tuple(shape - 2))
        controller = ActionValueTable(shape.prod(), 4)
        controller.initialize(1.0)
        learner = Q()
        agent = LearningAgent(controller, learner)
        task = MDPMazeTask(environment)
        experiment = Experiment(task, agent)

        for i in range(100):
            experiment.doInteractions(100)
            agent.learn()
            agent.reset()
            # 4 actions, 81 locations/states (9x9 grid)
            # max(1) gives/plots the biggest objective function value for that square
            pylab.pcolor(controller.params.reshape(81, 4).max(1).reshape(9, 9))
            pylab.draw()

        # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
        greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
        greedy_policy = np.flipud(np.array(list("NESW"))[greedy_policy].reshape(shape))
        maze = np.flipud(np.array(list(" #"))[structure])
        print("Maze map:")
        print("\n".join("".join(row) for row in maze))
        print("Greedy policy:")
        print("\n".join("".join(row) for row in greedy_policy))
Example #13
def main():
    rospy.init_node("lauron_reinforcement_learning")
    environment = RLEnvironment()
    dim_state = environment.joint_states.shape[0]
    num_actions = len(environment.actions)
    controller = ActionValueNetwork(dim_state, num_actions)
    learner = SARSA()
    agent = LearningAgent(controller, learner)
    task = RLTask(environment)
    experiment = Experiment(task, agent)

    episode_counter = 0
    while True:
        print("Training episode {}".format(episode_counter))
        experiment.doInteractions(NUM_INTERACTIONS)
        agent.learn()
        agent.reset()
        episode_counter += 1
def test_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = [
        list('!!!!!!!!!!'),
        list('! !  ! ! !'),
        list('! !! ! ! !'),
        list('!    !   !'),
        list('! !!!!!! !'),
        list('! ! !    !'),
        list('! ! !!!! !'),
        list('!        !'),
        list('! !!!!!  !'),
        list('!   !    !'),
        list('!!!!!!!!!!'),
    ]
    structure = np.array([[ord(c) - ord(' ') for c in row]
                          for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(
        np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(
        ''.join(row)
        for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
def main():

    # Storing every 2048 board state in a table is not feasible:
    #   there are on the order of 14^16 possible states.
    #controller = ActionValueTable(16, 4)
    #learner = Q()
    #controller.initialize(1.)

    controller = ActionValueNetwork(16, 4)
    learner = NFQ()
    #learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(controller, learner)

    score_list = []
    for i in range(10000):
        # if os.path.exists('./agent.dump'):
        #     with open('./agent.dump') as f:
        #         agent = pickle.load(f)

        print i, 'playing ...'
        score = play(agent)
        score_list.append(score)

        # At this point the following error occurred:
        #   TypeError: only length-1 arrays can be converted to Python scalars
        #   raised from pybrain/rl/learners/valuebased/q.py
        #   => switching the learner from Q to NFQ fixed it.
        #   => http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work
        print i, 'learning ...'
        agent.learn()
        agent.reset()

        print i, 'evaluate sample ...'
        data =[[0,0,0,0], [0,0,0,0], [0,0,0,2], [0,0,0,2]]
        agent.integrateObservation(numpy.array(data).ravel())
        move = agent.getAction()
        print "                           ",i, int(numpy.mean(score_list)) , max(score_list), move

        if i % 20 == 0:
            print i, 'saving ...'
            with open('./agent.dump', 'w') as f:
                pickle.dump(agent, f)
            with open('./score.dump', 'w') as f:
                pickle.dump(score_list, f)
Example #16
def explore_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = [
        list("!!!!!!!!!!"),
        list("! !  ! ! !"),
        list("! !! ! ! !"),
        list("!    !   !"),
        list("! !!!!!! !"),
        list("! ! !    !"),
        list("! ! !!!! !"),
        list("!        !"),
        list("! !!!!!  !"),
        list("!   !    !"),
        list("!!!!!!!!!!"),
    ]
    structure = np.array([[ord(c) - ord(" ") for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.0)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list("NESW"))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(" #"))[structure])
    print("Maze map:")
    print("\n".join("".join(row) for row in maze))
    print("Greedy policy:")
    print("\n".join("".join(row) for row in greedy_policy))
    assert "\n".join("".join(row) for row in greedy_policy) == "NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN"
def main():
    client_id = Utils.connectToVREP()

    # Define RL elements
    environment = StandingUpEnvironment(client_id)
    task = StandingUpTask(environment)
    controller = MyActionValueTable()
    learner = Q(0.5, 0.9)
    learner.explorer = EpsilonGreedyExplorer(0.15, 1)  # EpsilonGreedyBoltzmannExplorer()
    agent = LearningAgent(controller, learner)
    experiment = EpisodicExperiment(task, agent)

    controller.initialize(agent)

    i = 0
    try:
        while True:
            i += 1
            print('Episode ' + str(i))
            experiment.doEpisodes()
            agent.learn()
            agent.reset()
            print('mean: '+str(numpy.mean(controller.params)))
            print('max: '+str(numpy.max(controller.params)))
            print('min: '+str(numpy.min(controller.params)))

            if i % 500 == 0:  # Save q-table every 500 episodes
                print('Save q-table')
                controller.save()
                task.t_table.save()

    except (KeyboardInterrupt, SystemExit):
        with open('../data/standing-up-q.pkl', 'wb') as handle:
            pickle.dump(controller.params, handle)
        task.t_table.save()
        controller.save()

    vrep.simxFinish(client_id)
Example #18
        self.env.reset()

    @property
    def indim(self):
        return self.env.indim

    @property
    def outdim(self):
        return self.env.outdim



env = TetrisEnv(10,20) #Tetris
task = TetrisTask(env)

QNet = ActionValueNetwork(10*20+11, 6);

learner = NFQ(); #Q()?
learner._setExplorer(EpsilonGreedyExplorer(0.2,decay=0.99))

agent = LearningAgent(QNet,learner);

experiment = EpisodicExperiment(task,agent)

while True:
    experiment.doEpisodes(1)
    agent.learn()
    agent.reset() #or call more sporadically...?
    task.reset()

class SimulationMaster:

    def __init__(self, n_threads=4, initial_port=19997, q_table_version=0,
                 batch_size=None, learner=None, explorer=None):
        self.barrier = Barrier(n_threads + 1, timeout=720)
        self.n_threads = n_threads
        self.initial_port = initial_port
        self.batch_size = batch_size

        self.controller = MyActionValueTable(q_table_version)
        if learner is None:
            self.learner = Q(0.5, 0.9)
        else:
            self.learner = learner

        if explorer is None:
            self.explorer = self.learner.explorer = EpsilonGreedyExplorer(0.2, 0.998)
        else:
            self.explorer = self.learner.explorer = explorer
        self.agent = LearningAgent(self.controller, self.learner)
        # Logger initialization
        self.logger = logging.getLogger('master_logger')
        self.logger.setLevel(logging.DEBUG)
        self.logger.addHandler(logging.FileHandler(Utils.DATA_PATH + 'learning-tables/master.log'))
        self.failed_simulations = []
        self.n_episodes = 0
        self.simulations = []
        self.initialize_simulations()

    def initialize_simulations(self):
        self.simulations = []
        for i in range(self.n_threads):
            if self.batch_size is not None:
                self.simulations.append(Simulation(self, self.initial_port + i, self.batch_size))
            else:
                self.simulations.append(Simulation(self, self.initial_port + i))

    def get_action(self, observation):
        action = self.controller.activate(observation)
        action = self.explorer.activate(observation, action)
        return action

    def add_observation(self, obs):
        """
            Adds observation in the agent memory
            :param obs: 3 dimensional vector containing [observation, action, reward]
        """
        self.agent.integrateObservation(obs[0])
        self.agent.lastaction = obs[1]
        self.agent.giveReward(obs[2])

    def update_q_table(self):
        """
            Updates the q table with the new simulators observations
        """
        for sim in self.simulations:
            for trace in sim.traces:
                for obs in trace:
                    self.add_observation(obs)
                self.agent.learn()
                self.agent.reset()
                self.n_episodes += 1

            sim.traces.clear()
        if self.explorer.epsilon > 0.1:
            self.explorer.epsilon *= self.explorer.decay
        if self.learner.alpha > 0.1:
            self.learner.alpha *= 0.999
        self.logger.info('new epsilon: {}'.format(self.explorer.epsilon))
        self.logger.info('new alpha: {}'.format(self.learner.alpha))
        self.logger.info('n episodes: {}'.format(self.n_episodes))

    def save_t_table(self):
        """
            Saves t tables, one for each thread
        """
        for sim in self.simulations:
            sim.save_t_table()

    def run(self):

        self.controller.initialize(self.agent)
        for sim in self.simulations:
            sim.start()
        counter = 0
        while True:
            try:
                self.barrier.wait()  # wait until all simulations are done
                self.update_q_table()
                self.save_t_table()
                self.barrier.wait()  # Free simulations threads and start a new cycle
                # Counter to avoid to save q-table too often
                if counter == 5:
                    self.controller.save()
                    counter = 0
                else:
                    counter += 1
                while self.failed_simulations:
                    sim = self.failed_simulations.pop()
                    self.restart_simulation(sim)
            except BrokenBarrierError as e:
                self.logger.error('Broken Barrier Error Occurred')
                for sim in self.simulations:
                    sim.stop()
                for sim in self.simulations:
                    sim.join()
                del self.simulations
                self.initialize_simulations()
                self.barrier.reset()
                self.failed_simulations.clear()
                for sim in self.simulations:
                    sim.start()

    def restart_simulation(self, simulation):
        self.logger.info('Restarting simulation with port {}'.format(simulation.port))
        self.simulations.remove(simulation)
        new_simulation = Simulation(self, simulation.port)
        self.simulations.append(new_simulation)
        new_simulation.start()
        del simulation
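add_observation and update_q_table above replay externally collected (observation, action, reward) triples into the agent's history and only then call learn(). A minimal sketch of that replay pattern with a tiny hand-made trace (same PyBrain calls as in the class above; the table size, learning parameters and trace values are made up):

from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q
from pybrain.rl.learners.valuebased import ActionValueTable

controller = ActionValueTable(4, 2)
controller.initialize(0.)
agent = LearningAgent(controller, Q(0.5, 0.9))

trace = [([0], [1], 1.0), ([2], [0], 0.0)]  # (observation, action, reward) triples
for obs, action, reward in trace:
    agent.integrateObservation(obs)
    agent.lastaction = action  # bypass getAction(), as add_observation does above
    agent.giveReward(reward)
agent.learn()
agent.reset()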
Example #20
File: NUMPY.py Project: rbobkoskie3/OS
def Py_Brain():
    ############################
    # pybrain
    ############################
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap
    import itertools
    from scipy import linalg

    from pybrain.rl.environments.mazes import Maze, MDPMazeTask
    from pybrain.rl.learners.valuebased import ActionValueTable
    from pybrain.rl.agents import LearningAgent
    from pybrain.rl.learners import Q, SARSA
    from pybrain.rl.experiments import Experiment
    from pybrain.rl.environments import Task

    import pylab
    #pylab.gray()
    #pylab.ion()

    '''
    structure = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    '''
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 1, 0, 0, 1],
                          [1, 1, 0, 1, 1],
                          [1, 0, 0, 1, 1],
                          [1, 1, 1, 1, 1]])

    num_states = int(structure.shape[0]*structure.shape[1])
    SQRT = int(math.sqrt(num_states))
    #print structure.item((1, 3))
    #environment = Maze(structure, (7, 7)) #second parameter is goal field tuple
    environment = Maze(structure, (1, 3)) #second parameter is goal field tuple
    print type(environment)
    print environment
    # Standard maze environment comes with the following 4 actions:
    # North, South, East, West
    controller = ActionValueTable(num_states, 4) #[N, S, E, W] 
    controller.initialize(1)

    learner = Q()
    agent = LearningAgent(controller, learner)
    np.not_equal(agent.lastobs, None)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    #while True:
    for x in range(4):
        print x
        experiment.doInteractions(10)
        agent.learn()
        agent.reset()

        pylab.pcolor(controller.params.reshape(num_states,4).max(1).reshape(SQRT,SQRT))
        pylab.draw()
        #pylab.show()
        name='MAZE'
        plt.savefig(str(name)+'_PLOT.png')
    plt.close()
Example #21
def rl_optimizer(num_of_actions, params):

    # Defining UI environment
    actionmatrix = range(num_of_actions)
    goal = False # Default goal
    ui_env = UI(params.num_states, actionmatrix, num_of_actions, goal, params.sensor_errors, params.confusion_error, params.penalties, params.grid, params.dimensions, params.init_position, params.goal_position)
    av_table = ActionValueTable(params.num_states, num_of_actions, ui_env)
    av_table.initialize(1)

    # Train agent for each goal
    klm_tot = 0
    klm_avg = 0
    p_learned = 1


    ##############################################
    # Define Q-learning agent
    learner = Q(0.5, 0.99) 
    learner.explorer.epsilon = 0.7 
    learner.explorer.decay = 0.999
    learner.explorer.env = ui_env
    agent = LearningAgent(av_table, learner)

    #Initialize av table. Give action matrix as an input.
    av_table.initialize(-5., actionmatrix) 

    # Define task and experiment
    task = UITask(ui_env)
    experiment = EpisodicExperiment(task, agent, av_table)

    ##############################################
    # Training Agent
    for j in range(8): # Learning iterations

        runs = 50 # Episodes in one iteration
        experiment.doEpisodes(runs) 
        
        agent.learn()
        agent.reset()


    ##############################################
    # Evaluation of UI and policy for current goal
    # Loop to get average : use only if errors used
    klm_tasks_tot = np.array([0.]*(params.num_states-1))
    total_iterations = 1
    klm_tot = 0
    for i in range(total_iterations):
        # KLM value
        klm_g, best_path = evaluation(av_table, ui_env, task, False, params)
        if klm_g == -1: # Not learned
            klm_tot += 20*5
            p_learned = 0
            print "Policy not learned"
            break
        # Save to total KLM
        klm_tot += klm_g
    # Average KLM estimate
    klm_avg += klm_tot/total_iterations

    return best_path, klm_avg
Example #22
File: NFQ.py Project: vascobailao/PYTHON
    # Without the next line, the pyplot plot won't actually show up.
    plt.pause(0.001)

performance = []

if not render:
    pf_fig = plt.figure()

while(True):
    # one learning step after one episode of world-interaction
    experiment.doEpisodes(1)
    agent.learn(1)

    # test performance (these real-world experiences are not used for training)
    if render:
        env.delay = True
    experiment.agent = testagent
    r = mean([sum(x) for x in experiment.doEpisodes(5)])
    env.delay = False
    testagent.reset()
    experiment.agent = agent

    performance.append(r)
    if not render:
        plotPerformance(performance, pf_fig)

    print("reward avg", r)
    print("explorer epsilon", learner.explorer.epsilon)
    print("num episodes", agent.history.getNumSequences())
    print("update step", len(performance))
Example #23
    plt.pause(0.001)


performance = []

if not render:
    pf_fig = plt.figure()

while (True):
    # one learning step after one episode of world-interaction
    experiment.doEpisodes(1)
    agent.learn(1)

    # test performance (these real-world experiences are not used for training)
    if render:
        env.delay = True
    experiment.agent = testagent
    r = mean([sum(x) for x in experiment.doEpisodes(5)])
    env.delay = False
    testagent.reset()
    experiment.agent = agent

    performance.append(r)
    if not render:
        plotPerformance(performance, pf_fig)

    print("reward avg", r)
    print("explorer epsilon", learner.explorer.epsilon)
    print("num episodes", agent.history.getNumSequences())
    print("update step", len(performance))
Example #24
from pybrain.rl.experiments import Experiment

envmatrix = array([[1,1,1,1,1,1,1,1,1],
                   [1,0,0,1,0,0,0,0,1],
                   [1,0,0,1,0,0,1,0,1],
                   [1,0,0,1,0,0,1,0,1],
                   [1,0,0,1,0,1,1,0,1],
                   [1,0,0,0,0,0,1,0,1],
                   [1,1,1,1,1,1,1,0,1],
                   [1,0,0,0,0,0,0,0,1],
                   [1,1,1,1,1,1,1,1,1]])
environment = Maze(envmatrix, (7,7))
task = MDPMazeTask(environment)

table = ActionValueTable(81,4)
table.initialize(1.)

agent = LearningAgent(table,Q())

experiment = Experiment(task,agent)

plt.ion()
plt.gray()

for i in range(1000):
    experiment.doInteractions(100);
    agent.learn();
    agent.reset();
    plt.pcolor(table.params.reshape(81,4).max(axis=1).reshape(9,9))
    plt.gcf().canvas.draw()
Example #25
# define action-value table
# number of states is:
#
#    current value: 1-21
#
# number of actions:
#
#    Stand=0, Hit=1
av_table = ActionValueTable(21, 2)
av_table.initialize(0.)

# define Q-learning agent
learner = Q(0.5, 0.0)
learner._setExplorer(EpsilonGreedyExplorer(0.0))
agent = LearningAgent(av_table, learner)

# define the environment
env = BlackjackEnv()

# define the task
task = BlackjackTask(env)

# finally, define experiment
experiment = Experiment(task, agent)

# ready to go, start the process
while True:
    experiment.doInteractions(1)
    agent.learn()
    agent.reset()
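After interrupting the loop, the learned values can be inspected the same way the maze examples read controller.params: reshape to (numStates, numActions) and take the argmax per state. A standalone numpy sketch with made-up values (Stand=0, Hit=1, as in the comment above):

import numpy as np

params = np.zeros((21, 2))  # stands in for av_table.params.reshape(21, 2)
params[:16, 1] = 1.0        # pretend the agent learned to Hit below 17
params[16:, 0] = 1.0        # ... and to Stand on 17-21
greedy = np.where(params.argmax(axis=1) == 1, 'Hit', 'Stand')
print(greedy[-5:])          # the five highest hands (17-21) all map to 'Stand'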
Example #26
        EpisodicTask.reset(self)
        self.env.reset()

    @property
    def indim(self):
        return self.env.indim

    @property
    def outdim(self):
        return self.env.outdim


env = TetrisEnv(10, 20)  #Tetris
task = TetrisTask(env)

QNet = ActionValueNetwork(10 * 20 + 11, 6)

learner = NFQ()
#Q()?
learner._setExplorer(EpsilonGreedyExplorer(0.2, decay=0.99))

agent = LearningAgent(QNet, learner)

experiment = EpisodicExperiment(task, agent)

while True:
    experiment.doEpisodes(1)
    agent.learn()
    agent.reset()  #or call more sporadically...?
    task.reset()
Example #27
def rl_optimizer(UImatrix, actionmatrix, actions_in_uis, actions_penalty,
                 num_of_actions, params):
    policies = [([])] * params.num_states

    # Defining UI environment
    goal = 1  # Default goal
    ui_env = UI(UImatrix, actionmatrix, actions_in_uis, actions_penalty, goal,
                params)
    av_table = ActionValueTable(params.num_states, num_of_actions)

    klm_tot = 0
    klm_avg = 0
    p_learned = 1
    modality_table_total = np.array([0, 0, 0])
    klm_total = 0

    # Train agent for each goal
    for g in range(0, ui_env.num_of_states):

        ##############################################
        # Define Q-learning agent
        learner = Q(0.5, 0.9)
        learner.explorer.epsilon = 0.7
        learner.explorer.decay = 0.999
        learner.explorer.env = ui_env
        agent = LearningAgent(av_table, learner)

        # Define task and experiment
        task = UITask(ui_env)
        experiment = EpisodicExperiment(task, agent)

        # Initialize the av table. Removes disallowed actions.
        av_table.initialize(1., actionmatrix)

        # Set goal
        experiment.task.env.setGoal(g)

        for j in range(50):

            initial_state = mod(j, ui_env.num_of_states)
            if initial_state == g: continue
            experiment.task.env.setInitialState(initial_state)

            runs = 50
            experiment.doEpisodes(runs)

            agent.learn()
            agent.reset()

        ##############################################
        # Evaluation of UI and policy for the current goal
        # Iterate to get average - use only if errors used
        total_iterations = 10
        klm_tot = 0
        for i in range(total_iterations):
            # KLM value
            klm_g, modality_table = evaluation(av_table, ui_env, g, params)

            # Not learned
            if klm_g == -1:
                klm_tot += 20 * 5
                p_learned = 0
                return -1, 0, 0, 0, 0

            # Save to total KLM
            klm_tot += klm_g / (params.num_states - 1)

        klm_avg += klm_tot / total_iterations

        modality_table_total += np.array(modality_table)
        klm_total += klm_avg

        if p_learned == 0: break

    return modality_table_total, klm_total
Example #28
File: trainnet.py Project: nairboon/bnrl
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", task,parameters
    
    
    seed = parameters["seed"]
   

    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed)
    
    render = False    
    plot = False
    
    plt.ion()
    
    env = CartPoleEnvironment()
    env.randomInitialization = False
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()
    
    task_class = getattr(cp, task)
    task = task_class(env, 50)

    #print "dim: ", task.indim, task.outdim
    
    # to inputs state and 4 actions
    bmodule = ActionValueRAND(task.outdim, task.indim)
    rlearner = RAND()

    blearner = RAND()
    # % of random actions
    
    bagent = LearningAgent(bmodule, rlearner)
    
    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.rl.agents import OptimizationAgent
    from pybrain.optimization import PGPE

    module = buildNetwork(task.outdim, task.indim, bias=False)
    # create agent with controller and learner (and its options)

    # % of random actions
    #learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    
    
    agent = OptimizationAgent(module, PGPE(storeAllEvaluations = True,storeAllEvaluated=True, maxEvaluations=None, verbose=False))


    
    
    testagent = LearningAgent(module, None)
    pgpeexperiment = EpisodicExperiment(task, agent)
    randexperiment = EpisodicExperiment(task, bagent)


    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)
    
    performance = []
    
    if plot:
        pf_fig = plt.figure()
    
    m = parameters["MaxTotalEpisodes"]/parameters["EpisodesPerLearn"]
    
    ## train pgpe
    for episode in range(0,50):
        # one learning step after one episode of world-interaction
        y =pgpeexperiment.doEpisodes(1)
        
    be, bf = agent.learner._bestFound()
    print be,bf
    
    print "generate data"
    be.numActions = 1
    gdagent = LearningAgent(be, blearner)
    experiment = EpisodicExperiment(task, gdagent)
    
    for episode in range(0,1000):
#        print episode, " of 1000"
        # one learning step after one episode of world-interaction
        y =experiment.doEpisodes(1)
        
#        print y
        x = randexperiment.doEpisodes(1)
#        print len(y[0])
        #renderer.drawPlot()
        
        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True
        

        l = 5
        resList = (agent.learner._allEvaluations)[-l:-1]
        
#            print agent.learner._allEvaluations
        from scipy import array

        rLen = len(resList)
        avReward = array(resList).sum()/rLen
#            print avReward
#            print resList
#            exit(0)
#            print("Parameters:", agent.learner._bestFound())
#            print(
#                " Evaluation:", episode,
#                " BestReward:", agent.learner.bestEvaluation,
#                " AverageReward:", avReward)
#            if agent.learner.bestEvaluation == 0:
#                
#                print resList[-20:-1]
#                print "done"
#                break
        #print resList
        performance.append(avReward)
        

        env.delay = False
        testagent.reset()
        #experiment.agent = agent
    
#            performance.append(r)
        if plot:
            plotPerformance(performance, pf_fig)
            
        
#            print "reward avg", r
#            print "explorer epsilon", learner.explorer.epsilon
#            print "num episodes", agent.history.getNumSequences()
#            print "update step", len(performance)
            
    blearner.add_ds(rlearner.dataset)
    
    blearner.learn()
    #blearner.learnX(agent.learner._allEvaluated)
    print "done"
    return performance
Example #29
File: PGPE.py Project: nairboon/bnrl
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    
    seed = parameters["seed"]
   

    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)


    
    
    render = False    
    plot = False
    
    plt.ion()
    
    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()
    
    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"],desiredValue=None)

    #print "dim: ", task.indim, task.outdim

    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.rl.agents import OptimizationAgent
    from pybrain.optimization import PGPE

    module = buildNetwork(task.outdim, task.indim, bias=False)
    # create agent with controller and learner (and its options)

    # % of random actions
    #learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    
    
    agent = OptimizationAgent(module, PGPE(storeAllEvaluations = True,storeAllEvaluated=False, maxEvaluations=None,desiredEvaluation=1, verbose=False))
#
#    print agent
#    from pprint import pprint
#    pprint (vars(agent.learner))
    
    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    
    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)
    
    performance = []
    
    if plot:
        pf_fig = plt.figure()
    
    m = parameters["MaxTotalEpisodes"]/parameters["EpisodesPerLearn"]
    for episode in range(0,m):
        # one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])
        #agent.learn(1)
    
        #renderer.drawPlot()
        
        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True
        
        if (episode) % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            
            #experiment.agent = testagent
            #r = mean([sum(x) for x in testexperiment.doEpisodes(parameters["TestWith"])])
            #for i in range(0,parameters["TestWith"]):
#            y = testexperiment.doEpisodes(1)
#            print (agent.learner._allEvaluated)
#                
#            
#            from pprint import pprint
#            pprint (vars(task))
                
            l = parameters["TestWith"]
            
            task.N = parameters["MaxRunsPerEpisodeTest"]
            experiment.doEpisodes(l)
            task.N = parameters["MaxRunsPerEpisode"]

            resList = (agent.learner._allEvaluations)[-l:-1]
            
#            print agent.learner._allEvaluations
            from scipy import array

            rLen = len(resList)
            avReward = array(resList).sum()/rLen
#            print avReward
#            print resList
#            exit(0)
#            print("Parameters:", agent.learner._bestFound())
#            print(
#                " Evaluation:", episode,
#                " BestReward:", agent.learner.bestEvaluation,
#                " AverageReward:", avReward)
#            if agent.learner.bestEvaluation == 0:
#                
#                print resList[-20:-1]
#                print "done"
#                break
            performance.append(avReward)
            

            env.delay = False
            testagent.reset()
            #experiment.agent = agent
        
#            performance.append(r)
            if plot:
                plotPerformance(performance, pf_fig)
        
#            print "reward avg", r
#            print "explorer epsilon", learner.explorer.epsilon
#            print "num episodes", agent.history.getNumSequences()
#            print "update step", len(performance)
            
#    print "done"
    return performance
            
        #print "network",   json.dumps(module.bn.net.E, indent=2)
            
            
#import sumatra.parameters as p
#import sys
#parameter_file = sys.argv[1]
#parameters = p.SimpleParameterSet(parameter_file)
#
#
#run(["BalanceTask",parameters])
Example #30
def rl_optimizer(UImatrix, buttonmatrix, num_of_actions, top_UI, params,
                 batch_num, logging):

    # Defining UI environment
    goal = 1  # Default goal
    ui_env = UI(UImatrix, buttonmatrix, goal, params.sensor_errors,
                params.confusion_error, params.penalties)
    av_table = ActionValueTable(params.num_states, num_of_actions)

    # Check which actions are allowed
    actions_index = [[]] * params.num_states
    bad_actions_idx = []

    for j in range(0, params.num_states):
        for i in range(0, params.num_states):
            if i == j: continue
            if UImatrix[j][i] == 1:
                actions_index[j] = actions_index[j] + [1, 1, 1]
                bad_actions_idx = bad_actions_idx + [0, 0, 0]
            else:
                actions_index[j] = actions_index[j] + [0, 0, 0]
                bad_actions_idx = bad_actions_idx + [1, 1, 1]

    ui_env.actions_index = actions_index
    bad_actions = np.nonzero(np.array(bad_actions_idx))

    # Train agent for each goal
    klm_tot = 0
    klm_avg = 0
    policies = []
    objective = -1
    p_learned = 1
    ii = 0
    for g in range(0, ui_env.num_of_states):

        ##########################
        # Define Q-learning agent
        ##########################
        learner = Q(0.5, 0.9)
        learner.explorer.epsilon = 0.3
        learner.explorer.decay = 0.999
        learner.explorer.actions_index = actions_index
        learner.explorer.env = ui_env
        agent = LearningAgent(av_table, learner)

        ##########################
        # Define task and experiment
        ##########################
        task = UITask(ui_env)
        experiment = EpisodicExperiment(task, agent)

        # Set low values for not allowed actions
        bad_actions = np.ones([ui_env.num_of_states, ui_env.num_of_states])
        for idx_state in range(ui_env.num_of_states):
            for idx_button in range(len(buttonmatrix[idx_state])):
                bad_actions[idx_state, buttonmatrix[idx_state]] = 0
        bad_actions = np.reshape(bad_actions,
                                 ui_env.num_of_states * ui_env.num_of_states)
        bad_actions = np.nonzero(bad_actions)

        av_table.initialize(1., bad_actions[0])  #Removed bad actions

        # Initialize saved N av_tables
        convergence_N = 1  # Move to params
        av_tables_save = [] * convergence_N

        # Set goal
        experiment.task.env.setGoal(g)

        ##########################
        # Training agent
        ##########################
        # Add more iterations and runs if not learning.
        for j in range(10):

            initial_state = mod(j, ui_env.num_of_states)
            if initial_state == g: continue
            experiment.task.env.setInitialState(initial_state)

            runs = 15
            experiment.doEpisodes(runs)

            agent.learn()
            agent.reset()

        ##########################
        # Save policy
        ##########################
        p = list(av_table.params)  # Copies to new memory slot
        policies.append(p)

        ##############################################
        # Evaluation of UI and policy for current goal
        ##############################################
        # Loop to get average : use only if errors used
        klm_tasks_tot = np.array([0.] * (params.num_states - 1))
        total_iterations = 15
        klm_tot = 0
        for i in range(total_iterations):
            # KLM value
            klm_g = evaluation(av_table, ui_env, g, params, batch_num, logging)
            if klm_g == -1:  # Not learned
                klm_tot += 20 * 5
                p_learned = 0
                break
            # Save to total KLM
            klm_tot += klm_g / (params.num_states - 1)
        klm_avg += params.state_probs[g] * klm_tot / total_iterations

        if p_learned == 0: break

    if p_learned == 1:  # Policy learned
        ##########################
        # Consistency
        ##########################
        consistency = 0
        idx = 0
        transitions_state = np.sum(UImatrix, 0)
        buttons_to_states = np.zeros(
            [params.num_states,
             params.num_states])  # Which buttons reach to the state goal
        for sr in range(params.num_states):
            for sc in range(params.num_states):
                if UImatrix[sr, sc] == 1:
                    buttons_to_states[sc][buttonmatrix[sr][idx]] += 1
                    idx = idx + 1
            idx = 0
        for s in range(params.num_states):
            for act in range(params.num_states):
                if buttons_to_states[s][act] > 0:
                    consistency += math.log(buttons_to_states[s][act] /
                                            transitions_state[s])

        ##########################
        # Objective function
        ##########################
        objective = params.w_klm * klm_avg
        objective = objective - 1 * params.w_const * consistency
        objective = objective + params.w_simpl * math.log(
            np.sum(np.sum(UImatrix, 1)))
        objective_func = [
            klm_avg, consistency,
            math.log(np.sum(np.sum(UImatrix, 1)))
        ]

        ##########################
        # Save the best
        ##########################
        top_UI.append([
            UImatrix, buttonmatrix, policies, objective, objective_func,
            klm_avg
        ])
        if len(top_UI) > params.top:
            top_UI = sorted(top_UI, key=op.itemgetter(3))[:params.top]

    return top_UI, objective
Example #31
weeks = 52 * 2
days = 5  # number of samples per gradient estimate
for week in range(weeks):
    all_rewards = experiment.doEpisodes(number=days)
    tot_reward = numpy.mean(agent.history.getSumOverSequences('reward'))

    #    print learner._allEvaluations#[-:-1]

    # Plot the reward at each period averaged over the week.
    r = -1.0 * numpy.array(all_rewards).reshape(days, nf)
    avg_r = numpy.mean(r, 0)
    plot.setData(5, rday, avg_r)

    # Plot the set-point of each generator on the last day of the week.
    # FIXME: Plot the set-points averaged over the week.
    for i in range(len(case.online_generators)):
        scale_factor = 10
        #        plot.setData(i, rday, env._Pg[i, :] * scale_factor)
        plot.setData(i, rday, experiment.Pg[i, :] * scale_factor)

    agent.learn()
    agent.reset()

    # Scale sigma manually.
    sigma = [(sig * 0.95) - 0.05 for sig in sigma]
    learner.explorer.sigma = sigma

    plot.update()

pylab.savefig("/tmp/rlopf.png")
Example #32
File: robin.py Project: Waqquas/pylon
def roundrobin(case, learners, profile, m, nb, ns, mx, weeks, days,
               outdir="/tmp", dc=True, trial=0):
    np = len(profile)

    adj = "dc" if dc else "ac"
    market = SmartMarket(case, priceCap=100.0, decommit=True,
                         locationalAdjustment=adj)

    for i, perms in enumerate(itertools.permutations(learners)):
        experiment = MarketExperiment([], [], market, profile)

        for j, learner in enumerate(perms):
            gens = case.generators[j:j + 1]

            if isinstance(learner, ValueBasedLearner):
                # Comment out for stateful Roth-Erev learner.
                nstates = 1 if isinstance(learner, RothErev) else ns

                env = discrete.MarketEnvironment(gens, market,
                                                 markups=m,
                                                 numStates=nstates,
                                                 numOffbids=nb)
                task = discrete.ProfitTask(env, maxSteps=np)

                na = len(env._allActions)
                module = ActionValueTable(numStates=nstates, numActions=na)

            elif isinstance(learner, DirectSearchLearner):
                env = continuous.MarketEnvironment(gens, market, nb, mx)

                task = continuous.ProfitTask(env, maxSteps=np)

                module = buildNetwork(env.outdim, 2, env.indim,
                                      bias=True, outputbias=True,
                                      hiddenclass=TanhLayer,outclass=TanhLayer)
            else:
                raise ValueError

            agent = LearningAgent(module, learner)
            experiment.tasks.append(task)
            experiment.agents.append(agent)

        all_states = zeros((3, 0))
        all_actions = zeros((3, 0))
        all_rewards = zeros((3, 0))
        comments = ["Trial: %d, Perm: %d" % (trial, i)]
        for task, agent in zip(experiment.tasks, experiment.agents):
            g = task.env.generators[0]
            l = agent.learner.__class__.__name__
            comments.append("(%s, %s)" % (g.name, l))
        c = ", ".join(comments)

        for _ in range(weeks):
            experiment.doEpisodes(days)

            states = zeros((0, days * np))
            actions = zeros((0, days * np))
            rewards = zeros((0, days * np))
            for _, agent in enumerate(experiment.agents):
                states = r_[states, agent.history["state"].T]
                actions = r_[actions, agent.history["action"].T]
                rewards = r_[rewards, agent.history["reward"].T]

                agent.learn()
                agent.reset()

            all_states = c_[all_states, states]
            all_actions = c_[all_actions, actions]
            all_rewards = c_[all_rewards, rewards]

        mmwrite(join(outdir, "state_%d_%d.mtx" % (trial, i)), all_states, c)
        mmwrite(join(outdir, "action_%d_%d.mtx" % (trial, i)), all_actions, c)
        mmwrite(join(outdir, "reward_%d_%d.mtx" % (trial, i)), all_rewards, c)
Example #33
File: NFQ.py Project: nairboon/bnrl
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    
    seed = parameters["seed"]
   

    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)


    
    
    render = False    
    plot = False
    
    plt.ion()
    
    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()
    
    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"])

    #print "dim: ", task.indim, task.outdim
    
    # to inputs state and 4 actions
    module = ActionValueNetwork(task.outdim, task.indim)
    

    learner = NFQ()
    # % of random actions
    learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    
    
    agent = LearningAgent(module, learner)
    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    
    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)
    
    performance = []
    
    if plot:
        pf_fig = plt.figure()
    
    m = parameters["MaxTotalEpisodes"]/parameters["EpisodesPerLearn"]
    for episode in range(0,m):
        # one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])
        agent.learn(1)
    
        #renderer.drawPlot()
        
        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True
        
        if (episode) % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            
            #experiment.agent = testagent
            r = mean([sum(x) for x in testexperiment.doEpisodes(parameters["TestWith"])])
            
            env.delay = False
            testagent.reset()
            #experiment.agent = agent
        
            performance.append(r)
            if plot:
                plotPerformance(performance, pf_fig)
        
#            print "reward avg", r
#            print "explorer epsilon", learner.explorer.epsilon
#            print "num episodes", agent.history.getNumSequences()
#            print "update step", len(performance)
            
#    print "done"
    return performance
            
        #print "network",   json.dumps(module.bn.net.E, indent=2)
Example #34
def rl_optimizer(UImatrix, num_of_actions, top_UI, params, batch_num, logging):

    #global policies
    #global top_UI
    policies = [([])] * params.num_states
    num_states = params.num_states

    # Defining UI environment
    actionmatrix = range(num_of_actions)
    goal = False  # Default goal
    ui_env = UI(num_states, actionmatrix, num_of_actions, goal,
                params.sensor_errors, params.confusion_error, params.penalties,
                params.grid, params.dimensions, params.init_position,
                params.goal_position)
    av_table = ActionValueTable(num_states, num_of_actions, ui_env)
    av_table.initialize(1)

    # Train agent for each goal
    klm_tot = 0
    klm_avg = 0
    policies = []
    best_actions = []  # [0]*ui_env.num_of_states*(ui_env.num_of_states-1)
    objective = -1
    p_learned = 1
    ii = 0

    #########
    # Define Q-learning agent
    learner = Q(0.5, 0.99)  #Q(0.6, 0.99) # 0.5, 0.99
    learner.explorer.epsilon = 0.7  # 0.7 # 0.9
    learner.explorer.decay = 0.999  # 0.99
    learner.explorer.env = ui_env
    agent = LearningAgent(av_table, learner)

    # Define task and experiment
    task = UITask(ui_env)
    experiment = EpisodicExperiment(task, agent, av_table)
    #######

    # Remove disallowed actions; the action matrix is given as input
    av_table.initialize(-5., actionmatrix)

    for j in range(8):  # Learning iterations

        initial_state = 0

        runs = 50  # Episodes in one iteration
        experiment.doEpisodes(runs)

        agent.learn()
        agent.reset()

        ##############################################
        # Save policy
        # For optimization
        p = list(av_table.params)  # Copies to new memory slot
        policies.append(p)

    ##############################################
    # Evaluation of UI and policy for current goal
    # Loop to get average : use only if errors used

    klm_tasks_tot = np.array([0.] * (params.num_states - 1))
    total_iterations = 1
    klm_tot = 0
    for i in range(total_iterations):
        # KLM value
        klm_g, best_path = evaluation(av_table, ui_env, task, False, params,
                                      batch_num, logging)
        if klm_g == -1:  # Not learned
            klm_tot += 20 * 5
            p_learned = 0
            print "error"
            break
        # Save to total KLM
        klm_tot += klm_g
    klm_avg += klm_tot / total_iterations

    return top_UI, objective, best_actions, best_path, klm_g