Example #1
import random

import tensorflow as tf
from pynput import keyboard
from pynput.keyboard import Key

# DQN is the project's Deep Q-Network wrapper, defined elsewhere in the repository;
# the TensorFlow 1.x API (tf.Session, tf.global_variables_initializer) is assumed.

class Player:
    def __init__(self, name, isBot):
        self.name = name
        self.isBot = isBot
        if not self.isBot:
            self.chosenAction = 0
            self.defineKeyboardListener()

        self.initializeProperties()
        self.QNetwork = DQN("QN{}".format(name), self.miniBatchSize)
        self.TDTarget = DQN("TD{}".format(name), self.miniBatchSize)
        self.sess = tf.Session()
        self.QNetwork.setSess(self.sess)
        self.TDTarget.setSess(self.sess)
        self.sess.run(tf.global_variables_initializer())
        self.synchronise()

    def initializeProperties(self):
        self.synchronisationPeriod = 100  # training steps between target-network updates
        self.explorationRate = 0.999      # epsilon: probability of picking a random action

        # Behaviour when playing & training
        self.trainable = True
        self.exploiting = False

        # Statistics
        self.gamesWon = 0
        self.gamesLost = 0

        # Training
        self.trainingData = []
        self.maxBatchSize = 50000
        # trainingData will not have more than maxBatchSize elements
        self.miniBatchSize = 32
        self.miniBatch = []
        self.startTraining = 1000
        # training only starts once trainingData holds at least startTraining samples

        print("Properties initialized")

    def defineKeyboardListener(self):
        # pynput listener: arrow keys map to actions (1 = up, 2 = down); anything else is 0
        def on_press(key):
            try:
                if key == Key.up:
                    self.chosenAction = 1
                elif key == Key.down:
                    self.chosenAction = 2
                else:
                    self.chosenAction = 0
            except AttributeError:
                self.chosenAction = 0

        def on_release(key):
            self.chosenAction = 0
            if key == keyboard.Key.esc:
                # Stop listener
                return False

        self.listener = keyboard.Listener(on_press=on_press,
                                          on_release=on_release)
        self.listener.start()

    def training(self, step):
        if not self.trainable or len(self.trainingData) < self.startTraining:
            return
        # Periodically copy the online Q-network weights into the TD-target network
        if step % self.synchronisationPeriod == 0:
            self.synchronise()
        # Sample a random mini-batch of transitions from the replay memory
        self.miniBatch = random.sample(self.trainingData, self.miniBatchSize)
        states, actions, rewards, nextStates = zip(*self.miniBatch)
        output = self.TDTarget.computeTarget(nextStates, rewards)
        self.QNetwork.training(states, output, actions)

    def play(self):
        if self.isBot:
            # epsilon-greedy: exploit the Q-network, otherwise act randomly
            if self.exploiting or random.random() > self.explorationRate:
                return self.QNetwork.evaluate(self.buffer)
            else:
                return random.randint(0, 1)
        else:
            return self.chosenAction

    def updateConstants(self, learningRate=None, explorationRate=None):
        self.QNetwork.updateConstants(learningRate)
        if explorationRate is not None:
            self.explorationRate = explorationRate

    def resetStats(self):
        self.gamesWon = 0
        self.gamesLost = 0

    def updateStats(self, reward):
        if reward == 1:
            self.gamesWon += 1
        elif reward == -1:
            self.gamesLost += 1

    def displayStats(self):
        # print("{} victories & {} defeats".format(self.gamesWon, self.gamesLost))
        print(self.gamesWon, self.gamesLost)

    def addStateSequence(self, action, reward, nextState):
        # self.buffer holds the previous state; the game is expected to seed it
        # before the first transition is recorded.
        if self.trainable:
            self.trainingData.append([self.buffer, action, reward, nextState])
            # The replay memory acts as a FIFO: drop the oldest transition when full
            while len(self.trainingData) > self.maxBatchSize:
                self.trainingData.pop(0)
        self.buffer = nextState

    def saveQNetwork(self, path, global_step=None):
        self.QNetwork.saveQNetwork(path, global_step)

    def restoreQNetwork(self, path, global_step=None):
        self.QNetwork.restoreQNetwork(path, global_step)

    def setBehaviour(self, isTraining):
        self.trainable = isTraining
        self.exploiting = not isTraining

    def synchronise(self):
        # Copy every trainable variable of the Q-network into the TD-target network
        e1_params = [
            t for t in tf.trainable_variables()
            if t.name.startswith(self.QNetwork.scope)
        ]
        e1_params = sorted(e1_params, key=lambda v: v.name)
        e2_params = [
            t for t in tf.trainable_variables()
            if t.name.startswith(self.TDTarget.scope)
        ]
        e2_params = sorted(e2_params, key=lambda v: v.name)

        update_ops = []
        for e1_v, e2_v in zip(e1_params, e2_params):
            op = e2_v.assign(e1_v)
            update_ops.append(op)
        self.sess.run(update_ops)
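
A minimal driver loop may help show how this Player is meant to be used. The sketch below is not part of the original example: get_state, apply_action and get_reward are hypothetical stand-ins for whatever game environment the project provides, and the DQN class is still required for it to actually run.

# Hypothetical usage sketch -- the three helpers below are placeholders, not project code.
def get_state():
    return [0.0, 0.0, 0.0, 0.0]        # stub observation; the real game supplies its state

def apply_action(action):
    pass                               # stub; the real game would move the paddle here

def get_reward():
    return 0                           # stub; the real game returns e.g. -1 / 0 / +1

player = Player("p1", isBot=True)
player.buffer = get_state()            # seed the first state before calling play()
for step in range(100000):
    action = player.play()                             # epsilon-greedy action selection
    apply_action(action)
    reward, nextState = get_reward(), get_state()
    player.addStateSequence(action, reward, nextState)
    player.updateStats(reward)
    player.training(step)                              # no-op until the replay memory is warm
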
Example #2

import random

import tensorflow as tf

# DQN is again the project's Deep Q-Network wrapper, defined elsewhere in the repository.

class Player:

    def __init__(self, name):
        self.name = name

        self.initializeProperties()
        self.QNetwork = DQN(self.imageSize, "QN", self.miniBatchSize)
        self.TDTarget = DQN(self.imageSize, "TD", self.miniBatchSize)
        self.sess = tf.Session()
        self.QNetwork.setSess(self.sess)
        self.TDTarget.setSess(self.sess)
        self.sess.run(tf.global_variables_initializer())
        self.synchronise()

    def initializeProperties(self):
        # Q Network Constants
        self.imageSize = 80
        self.synchronisationPeriod = 500

        # Constants
        self.explorationRate = 0.999  # epsilon: probability of picking a random action

        # Behaviour when playing & training
        self.trainable = True
        self.exploiting = False

        # Statistics
        self.score = 0

        # Training
        self.trainingData = []
        self.maxBatchSize = 10000
        # trainingData will not have more than maxBatchSize elements
        self.miniBatchSize = 32
        self.miniBatch = []
        self.startTraining = 1000
        # training only starts once trainingData holds at least startTraining samples

        print("Properties initialized")

    def training(self, step):
        if not self.trainable or len(self.trainingData) < self.startTraining:
            return
        if step % self.synchronisationPeriod == 0:
            self.synchronise()
        self.miniBatch = random.sample(self.trainingData, self.miniBatchSize)
        states, actions, rewards, nextStates = zip(*self.miniBatch)
        output = self.TDTarget.computeTarget(nextStates, rewards)
        self.QNetwork.training(states, output, actions)

    def play(self):
        if self.exploiting or random.random() > self.explorationRate:
            return self.QNetwork.evaluate(self.buffer)
        else:
            return int(random.random() < 0.9)  # random action: 1 with probability 0.9, else 0

    def updateConstants(self, learningRate=None, explorationRate=None):
        self.QNetwork.updateConstants(learningRate)
        if explorationRate is not None:
            self.explorationRate = explorationRate

    def resetStats(self):
        self.score = 0

    def updateStats(self, reward):
        if reward == 1:
            self.score += 1

    def displayStats(self):
        print(self.score)

    def addStateSequence(self, action, reward, nS):
        # nS = np.transpose(nS, [1, 2, 0])
        if self.trainable:
            self.trainingData.append([self.buffer, action, reward, nS])
            # Keep the replay memory bounded by discarding the oldest transition
            while len(self.trainingData) > self.maxBatchSize:
                del self.trainingData[0]
        self.buffer = nS

    def saveQNetwork(self, path, global_step=None):
        self.QNetwork.saveQNetwork(path, global_step)

    def restoreQNetwork(self, path, global_step=None):
        self.QNetwork.restoreQNetwork(path, global_step)

    def setBehaviour(self, isTraining):
        self.trainable = isTraining
        self.exploiting = not isTraining

    def synchronise(self):
        e1_params = [t for t in tf.trainable_variables() if t.name.startswith(self.QNetwork.scope)]
        e1_params = sorted(e1_params, key=lambda v: v.name)
        e2_params = [t for t in tf.trainable_variables() if t.name.startswith(self.TDTarget.scope)]
        e2_params = sorted(e2_params, key=lambda v: v.name)

        update_ops = []
        for e1_v, e2_v in zip(e1_params, e2_params):
            op = e2_v.assign(e1_v)
            update_ops.append(op)
        self.sess.run(update_ops)
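
Neither version decays explorationRate on its own; that is left to whoever drives the training. The loop below is only an illustrative sketch of how updateConstants could be used for an epsilon decay: the 0.9995 factor and the 0.05 floor are assumptions, not values from the source.

# Illustrative epsilon decay (assumed schedule, not from the original example)
player = Player("decay-demo")
epsilon = 0.999
for episode in range(10000):
    epsilon = max(0.05, epsilon * 0.9995)          # hypothetical annealing schedule
    player.updateConstants(explorationRate=epsilon)
    # ... run one episode of play and training here ...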