Example #1
    def __init__(self):
        self.param = param
        self.env = Pairwise(train_env, 2, Z=self.param.Z)
        self.LEARNING_RATE = 1e-6  # For win_size 30-100, 200, and 500: 1e-3 to 1e-5; for win_size 1000: 1e-4 to 1e-5

        tf.reset_default_graph()
        """ Define Deep reinforcement learning network """
        if FLAGS.model_name == "DQN":
            self.mainQN = Qnetwork(self.param.h_size, self.env,
                                   self.LEARNING_RATE, self.param.n_step)
            self.targetQN = Qnetwork(self.param.h_size, self.env,
                                     self.LEARNING_RATE, self.param.n_step)
            self.trainables = tf.trainable_variables()
            self.targetOps = updateTargetGraph(self.trainables, self.param.tau)
        elif FLAGS.model_name == "SSD":
            self.mainQN = SSDnetwork(self.param.h_size, self.env, "main",
                                     self.LEARNING_RATE, self.param.n_step)
            self.targetQN = SSDnetwork(self.param.h_size, self.env, "target",
                                       self.LEARNING_RATE, self.param.n_step)
            self.trainables = tf.trainable_variables()
            self.targetOps = updateTargetGraph(self.trainables, self.param.tau)
        elif FLAGS.model_name == "DiffSSD":
            self.mainQN = DiffSSDnetwork(self.param.h_size, self.env, "main",
                                         self.LEARNING_RATE, self.param.n_step)
            self.targetQN = DiffSSDnetwork(self.param.h_size, self.env,
                                           "target", self.LEARNING_RATE,
                                           self.param.n_step)
            self.trainables = tf.trainable_variables()
            self.targetOps = updateTargetGraph(self.trainables, self.param.tau)

        self.init = tf.global_variables_initializer()
        self.saver = tf.train.Saver()
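
The helpers updateTargetGraph and updateTarget are defined outside these snippets. A minimal sketch of what they are assumed to do here, following the common TF1 Double-DQN pattern in which the first half of tf.trainable_variables() belongs to the main network and the second half to the target network:

import tensorflow as tf

def updateTargetGraph(tfVars, tau):
    # Build ops that blend each main-network variable into its
    # target-network counterpart: target <- tau*main + (1 - tau)*target
    total_vars = len(tfVars)
    op_holder = []
    for idx, var in enumerate(tfVars[0:total_vars // 2]):
        target_var = tfVars[idx + total_vars // 2]
        op_holder.append(target_var.assign(
            tau * var.value() + (1 - tau) * target_var.value()))
    return op_holder

def updateTarget(op_holder, sess):
    # Run every soft-update op in the given session
    for op in op_holder:
        sess.run(op)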
Example #2
    def __init__(self):
        self.env = Pairwise(game_env(), 2, Z=param.Z)

        tf.reset_default_graph()
        
        """ Define Deep reinforcement learning network """
        # beta and alpha here are presumably the MAML-style outer/inner learning rates
        self.mainQN = DiffSSDnetwork(param.h_size, self.env, "main", param.beta, param.n_step)
        self.targetQN = DiffSSDnetwork(param.h_size, self.env, "target", param.alpha, param.n_step)
        self.trainables = tf.trainable_variables()
        self.copyOps = copyGraphOp(self.trainables)

        self.init = tf.global_variables_initializer()
        self.saver = tf.train.Saver()
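
copyGraphOp is likewise external; a plausible sketch under the same variable-ordering assumption, performing a hard (tau = 1) copy of the main network into the target network:

def copyGraphOp(tfVars):
    # Hypothetical helper: overwrite each target-network variable with
    # its main-network counterpart
    total_vars = len(tfVars)
    return [tfVars[idx + total_vars // 2].assign(var.value())
            for idx, var in enumerate(tfVars[0:total_vars // 2])]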
Example #3
    def __init__(self):
        self.param = import_module('DQNalign.param.' + FLAGS.network_set)
        self.env = Pairwise(train_env.reward, train_env.l_seq, train_env.win_size, train_env.p, train_env.maxI, -1)

        tf.reset_default_graph()
        
        """ Define Deep reinforcement learning network """
        if FLAGS.model_name == "DQN":
            self.mainQN = Qnetwork(self.param.h_size, self.env)
            self.targetQN = Qnetwork(self.param.h_size, self.env)
            self.trainables = tf.trainable_variables()
            self.targetOps = updateTargetGraph(self.trainables, self.param.tau)
        elif FLAGS.model_name == "SSD":
            self.mainQN = SSDnetwork(self.param.h_size, self.env, "main")
            self.targetQN = SSDnetwork(self.param.h_size, self.env, "target")
            self.trainables = tf.trainable_variables()
            self.targetOps = updateTargetGraph(self.trainables, self.param.tau)

        self.init = tf.global_variables_initializer()
        self.saver = tf.train.Saver()
Example #4
    def __init__(self,
                 FLAGS,
                 istrain,
                 game_env,
                 model,
                 seq1=[],
                 seq2=[],
                 ismeta=False):
        """ Get parameters from files """
        self.FLAGS = FLAGS
        self.istrain = istrain

        if ismeta:
            self.param = import_module('DQNalign.param.MAML')
        else:
            self.param = import_module('DQNalign.param.' +
                                       self.FLAGS.network_set)
            """ Exploration strategy """
            if self.istrain:
                self.l_seq = game_env.l_seq
                self.e = self.param.startE
                self.stepDrop = (self.param.startE -
                                 self.param.endE) / self.param.annealing_steps
        """ Define sequence alignment environment """
        if self.istrain:
            self.env = Pairwise(game_env, 0, Z=self.param.Z)
        else:
            if len(seq1) + len(seq2) > 0:
                self.env = Pairwise(game_env, 1, seq1, seq2, Z=self.param.Z)
            else:
                self.env = Pairwise(game_env, 0, Z=self.param.Z)

        if ismeta:
            self.mainQN = model.mainQN
            self.tempQN = model.targetQN
            self.trainables = model.trainables
            self.copyOps = model.copyOps
        elif self.FLAGS.model_name in ("DQN", "SSD", "DiffSSD", "FFTDQN"):
            self.mainQN = model.mainQN
            self.targetQN = model.targetQN
            self.trainables = model.trainables
            self.targetOps = model.targetOps
        """ Initialize the variables """
        self.total_steps = 0
        self.start = time.time()
        self.myBuffer = experience_buffer()
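
experience_buffer is also not shown in these examples. A minimal sketch, assuming the standard tutorial-style replay buffer that stores [s, a, r, s1, d] rows and samples uniformly at random:

import random
import numpy as np

class experience_buffer():
    def __init__(self, buffer_size=50000):
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        # Evict the oldest entries once the buffer would overflow
        overflow = len(self.buffer) + len(experience) - self.buffer_size
        if overflow > 0:
            self.buffer[0:overflow] = []
        self.buffer.extend(experience)

    def sample(self, size):
        # Uniformly sample `size` transitions as a (size, 5) array
        return np.reshape(np.array(random.sample(self.buffer, size)),
                          [size, 5])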
Example #5
    def __init__(self):
        self.param = import_module('DQNalign.param.' + FLAGS.network_set)
        self.env = Pairwise(train_env, -1, Z=self.param.Z)
        self.LEARNING_RATE = 1e-7

        tf.reset_default_graph()
        """ Define Deep reinforcement learning network """
        if FLAGS.model_name == "DQN":
            self.mainQN = Qnetwork(self.param.h_size, self.env,
                                   self.LEARNING_RATE, self.param.n_step)
            self.targetQN = Qnetwork(self.param.h_size, self.env,
                                     self.LEARNING_RATE, self.param.n_step)
            self.trainables = tf.trainable_variables()
            self.targetOps = updateTargetGraph(self.trainables, self.param.tau)
        elif FLAGS.model_name == "SSD":
            self.mainQN = SSDnetwork(self.param.h_size, self.env, "main",
                                     self.LEARNING_RATE, self.param.n_step)
            self.targetQN = SSDnetwork(self.param.h_size, self.env, "target",
                                       self.LEARNING_RATE, self.param.n_step)
            self.trainables = tf.trainable_variables()
            self.targetOps = updateTargetGraph(self.trainables, self.param.tau)

        self.init = tf.global_variables_initializer()
        self.saver = tf.train.Saver()
Example #6
class Agent():
    def __init__(self,
                 FLAGS,
                 istrain,
                 game_env,
                 model,
                 seq1=[],
                 seq2=[],
                 ismeta=False):
        """ Get parameters from files """
        self.FLAGS = FLAGS
        self.istrain = istrain

        if ismeta:
            self.param = import_module('DQNalign.param.MAML')
        else:
            self.param = import_module('DQNalign.param.' +
                                       self.FLAGS.network_set)
            """ Exploration strategy """
            if self.istrain:
                self.l_seq = game_env.l_seq
                self.e = self.param.startE
                self.stepDrop = (self.param.startE -
                                 self.param.endE) / self.param.annealing_steps
        """ Define sequence alignment environment """
        if self.istrain:
            self.env = Pairwise(game_env, 0, Z=self.param.Z)
        else:
            if len(seq1) + len(seq2) > 0:
                self.env = Pairwise(game_env, 1, seq1, seq2, Z=self.param.Z)
            else:
                self.env = Pairwise(game_env, 0, Z=self.param.Z)

        if ismeta:
            self.mainQN = model.mainQN
            self.tempQN = model.targetQN
            self.trainables = model.trainables
            self.copyOps = model.copyOps
        elif self.FLAGS.model_name in ("DQN", "SSD", "DiffSSD", "FFTDQN"):
            self.mainQN = model.mainQN
            self.targetQN = model.targetQN
            self.trainables = model.trainables
            self.targetOps = model.targetOps
        """ Initialize the variables """
        self.total_steps = 0
        self.start = time.time()
        self.myBuffer = experience_buffer()

    def reset(self):
        """ Define sequence alignment environment """
        self.istrain = True
        self.env.sizeS1 = self.l_seq[0]
        self.env.sizeS2 = self.l_seq[1]

    def set(self, seq1=[], seq2=[]):
        """ Define sequence alignment environment """
        self.istrain = False
        self.env.test(seq1, seq2)

    def train(self, sess):
        # Sample a batch of transitions from the experience buffer
        trainBatch = self.myBuffer.sample(self.param.batch_size)
        #print(np.shape(np.vstack(trainBatch[:, 3])))

        if self.FLAGS.model_name in ("DQN", "SSD", "DiffSSD", "FFTDQN"):
            # Q1: greedy action indices from the main network (for s1);
            # Q2: Q values from the target network (Double DQN)
            Q1 = sess.run(self.mainQN.predict,
                          feed_dict={
                              self.mainQN.scalarInput: np.vstack(trainBatch[:,
                                                                            3])
                          })
            Q2 = sess.run(self.targetQN.Qout,
                          feed_dict={
                              self.targetQN.scalarInput:
                              np.vstack(trainBatch[:, 3])
                          })

            # trainBatch[:, 4] flags whether the action ended the episode.
            # For terminal steps only the reward updates the Q value;
            # otherwise the Double DQN target is used:
            # Qmain(s,a) = r(s,a) + y * Qtarget(s1, argmax_a Qmain(s1,a))
            end_multiplier = -(trainBatch[:, 4] - 1)  # equals 1 - done
            doubleQ = Q2[range(self.param.batch_size), Q1]
            targetQ = trainBatch[:, 2] + (self.param.y * doubleQ * end_multiplier)
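            # Worked example (illustrative numbers): with y = 0.99 and a
            # non-terminal transition (done = 0, r = 1, doubleQ = 2.0),
            # end_multiplier = 1 and targetQ = 1 + 0.99 * 2.0 = 2.98; a
            # terminal transition (done = 1) zeroes the bootstrap term,
            # leaving targetQ = r.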
            _, loss = sess.run(
                [self.mainQN.updateModel, self.mainQN.loss],
                feed_dict={
                    self.mainQN.scalarInput: np.vstack(trainBatch[:, 0]),
                    self.mainQN.targetQ: targetQ,
                    self.mainQN.actions: trainBatch[:, 1]
                })
            updateTarget(self.targetOps, sess)  # Soft-update the target network toward the main network (ratio tau)

    def skip(self):
        # Greedily emit the diagonal "advance" action (0) for the longest
        # run of positions where seq1 and seq2 agree inside the window
        a = []

        seq1end = min(self.env.x + self.env.win_size - 1, self.env.sizeS1 - 1)
        seq2end = min(self.env.y + self.env.win_size - 1, self.env.sizeS2 - 1)
        minend = min(seq1end - self.env.x, seq2end - self.env.y)
        diff = np.where(
            self.env.seq1[self.env.x:self.env.x + minend +
                          1] != self.env.seq2[self.env.y:self.env.y + minend +
                                              1])
        if np.size(diff) > 0:
            a = [0] * np.min(diff)
        else:
            a = [0] * minend

        return a
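
    # Illustrative skip() example (hypothetical data): with seq1 = [0, 1, 2, 3],
    # seq2 = [0, 1, 3, 3], x = y = 0, and a window covering both sequences,
    # diff = (array([2]),), so skip() returns [0, 0]: diagonal moves up to
    # the first mismatch.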

    def reverseskip(self):
        # Same as skip(), but scanning backwards from (x, y)
        a = []

        seq1end = max(self.env.x - self.env.win_size + 1, 0)
        seq2end = max(self.env.y - self.env.win_size + 1, 0)
        minend = min(self.env.x - seq1end, self.env.y - seq2end)
        diff = np.where(
            self.env.seq1[self.env.x - minend:self.env.x + 1][::-1] !=
            self.env.seq2[self.env.y - minend:self.env.y + 1][::-1])
        if np.size(diff) > 0:
            a = [0] * np.max(diff)
        else:
            a = [0] * minend

        return a

    def skipRC(self):
        # Forward skip of seq1 against the reverse complement rev2,
        # which is consumed from high to low index
        a = []

        seq1end = min(self.env.x + self.env.win_size - 1, self.env.sizeS1 - 1)
        seq2end = max(self.env.y - self.env.win_size + 1, 0)
        minend = min(seq1end - self.env.x, self.env.y - seq2end)
        diff = np.where(
            self.env.seq1[self.env.x:self.env.x + minend +
                          1] != self.env.rev2[self.env.y - minend:self.env.y +
                                              1][::-1])
        if np.size(diff) > 0:
            a = [0] * np.min(diff)
        else:
            a = [0] * minend

        return a

    def reverseskipRC(self):
        # Backward skip of seq1 against the reverse complement rev2
        a = []

        seq1end = max(self.env.x - self.env.win_size + 1, 0)
        seq2end = min(self.env.y + self.env.win_size - 1, self.env.sizeS2 - 1)
        minend = min(self.env.x - seq1end, seq2end - self.env.y)
        diff = np.where(
            self.env.seq1[self.env.x - minend:self.env.x +
                          1][::-1] != self.env.rev2[self.env.y:self.env.y +
                                                    minend + 1])
        if np.size(diff) > 0:
            a = [0] * np.max(diff)
        else:
            a = [0] * minend

        return a

    def metatrain(self, sess, mainBuffer=False, X=0):
        episodeBuffer = experience_buffer()
        if self.istrain:
            # Environment reset for each episode
            s1 = self.env.reset()  # Rendered image of the alignment environment
            s1 = processState(s1)
        else:
            s1 = processState(self.env.renderEnv())

        d = False  # The state of the game (End or Not)
        j = 0
        rT1 = 0  # Total reward
        rT2 = 0  # Total reward
        best = 0
        flag = False

        while j < self.env.sizeS1 + self.env.sizeS2:  # Run until the maximum episode length (sum of the sequence lengths)
            if self.env.seq1[self.env.x] == self.env.seq2[self.env.y]:
                a = self.skip()
            else:
                s1 = processState(self.env.renderEnv())
                a = sess.run(self.tempQN.predict,
                             feed_dict={self.tempQN.scalarInput: [s1]})

            #print(self.env.x,self.env.y,a,self.env.seq1[self.env.x],self.env.seq2[self.env.y],j,rT1)
            # Apply each chosen action; record the transition, reward and done flag
            for _ in range(np.size(a)):
                s = s1
                s1, r, d = self.env.step(a[_])
                s1 = processState(s1)
                episodeBuffer.add(
                    np.reshape(np.array([s, a[_], r, s1, d]),
                               [1, 5]))  # Save the result into episode buffer
                rT1 += r
                rT2 += (r > 0)
                j += 1
                if rT1 >= best:
                    best = rT1

                # If the score drops more than X below the best, stop extending
                if (rT1 < best - X) and (X > 0):
                    flag = True
                    break

                if d == True:
                    break

            if (j % self.param.update_freq
                    == 0) and (j >= self.param.batch_size) and self.istrain:
                #print(j, self.env.x, self.env.y)
                # update the temp network
                trainBatch = episodeBuffer.sample(
                    self.param.batch_size
                )  # Select the batch from the experience buffer

                # Q1: greedy action indices, Q2: Q values (both from the temp network here)
                Q1 = sess.run(self.tempQN.predict,
                              feed_dict={
                                  self.tempQN.scalarInput:
                                  np.vstack(trainBatch[:, 3])
                              })
                Q2 = sess.run(self.tempQN.Qout,
                              feed_dict={
                                  self.tempQN.scalarInput:
                                  np.vstack(trainBatch[:, 3])
                              })

                # trainBatch[:, 4] flags whether the action ended the episode.
                # For terminal steps only the reward updates the Q value;
                # otherwise the Double DQN target is used:
                # Qmain(s,a) = r(s,a) + y * Qtarget(s1, argmax_a Qmain(s1,a))
                end_multiplier = -(trainBatch[:, 4] - 1)  # equals 1 - done
                doubleQ = Q2[range(self.param.batch_size), Q1]
                targetQ = trainBatch[:, 2] + (self.param.y * doubleQ *
                                              end_multiplier)
                _ = sess.run(self.tempQN.updateModel,
                             feed_dict={
                                 self.tempQN.scalarInput:
                                 np.vstack(trainBatch[:, 0]),
                                 self.tempQN.targetQ:
                                 targetQ,
                                 self.tempQN.actions:
                                 trainBatch[:, 1]
                             })

            if d or flag:
                break

        if self.istrain:
            # Environment reset for each episode
            s1 = self.env.reset()  # Rendered image of the alignment environment
            s1 = processState(s1)

            d = False  # The state of the game (End or Not)
            j = 0
            rT1 = 0  # Total reward
            rT2 = 0  # Total reward

            while j < self.env.sizeS1 + self.env.sizeS2:  # Run until the maximum episode length (sum of the sequence lengths)
                if self.env.seq1[self.env.x] == self.env.seq2[self.env.y]:
                    a = self.skip()
                else:
                    s1 = processState(self.env.renderEnv())
                    a = sess.run(self.tempQN.predict,
                                 feed_dict={self.tempQN.scalarInput: [s1]})

                # Apply each chosen action; record the transition, reward and done flag
                for _ in range(np.size(a)):
                    s = s1
                    s1, r, d = self.env.step(a[_])
                    s1 = processState(s1)
                    mainBuffer.add(
                        np.reshape(
                            np.array([s, a[_], r, s1, d]),
                            [1, 5]))  # Save the result into episode buffer
                    rT1 += r
                    rT2 += (r > 0)
                    j += 1

                    if d == True:
                        break

                if d == True:
                    break

        return rT1, rT2, j, mainBuffer

    def metatrain2(self, sess, mainBuffer=False, X=0):
        # Same inner/outer loop as metatrain(), but on the difference-image
        # observation (renderDiff / stepDiff) instead of the plain rendering
        episodeBuffer = experience_buffer()
        if self.istrain:
            # Environment reset for each episode
            s1 = self.env.reset(2)  # Rendered image of the alignment environment
            s1 = processState(s1)
        else:
            s1 = processState(self.env.renderDiff())

        d = False  # The state of the game (End or Not)
        j = 0
        rT1 = 0  # Total reward
        rT2 = 0  # Total reward
        best = 0
        flag = False

        while j < self.env.sizeS1 + self.env.sizeS2:  # Run until the maximum episode length (sum of the sequence lengths)
            if self.env.seq1[self.env.x] == self.env.seq2[self.env.y]:
                a = self.skip()
            else:
                s1 = processState(self.env.renderDiff())
                a = sess.run(self.tempQN.predict,
                             feed_dict={self.tempQN.scalarInput: [s1]})

            #print(self.env.x,self.env.y,a,self.env.seq1[self.env.x],self.env.seq2[self.env.y],j,rT1)
            # Apply each chosen action; record the transition, reward and done flag
            for _ in range(np.size(a)):
                s = s1
                s1, r, d = self.env.stepDiff(a[_])
                s1 = processState(s1)
                episodeBuffer.add(
                    np.reshape(np.array([s, a[_], r, s1, d]),
                               [1, 5]))  # Save the result into episode buffer
                rT1 += r
                rT2 += (r > 0)
                j += 1
                if rT1 >= best:
                    best = rT1

                # If the score drops more than X below the best, stop extending
                if (rT1 < best - X) and (X > 0):
                    flag = True
                    break

                if d == True:
                    break

            if (j % self.param.update_freq
                    == 0) and (j >= self.param.batch_size) and self.istrain:
                #print(j, self.env.x, self.env.y)
                # update the temp network
                trainBatch = episodeBuffer.sample(
                    self.param.batch_size
                )  # Select the batch from the experience buffer

                # Q1: greedy action indices, Q2: Q values (both from the temp network here)
                Q1 = sess.run(self.tempQN.predict,
                              feed_dict={
                                  self.tempQN.scalarInput:
                                  np.vstack(trainBatch[:, 3])
                              })
                Q2 = sess.run(self.tempQN.Qout,
                              feed_dict={
                                  self.tempQN.scalarInput:
                                  np.vstack(trainBatch[:, 3])
                              })

                # trainBatch[:, 4] flags whether the action ended the episode.
                # For terminal steps only the reward updates the Q value;
                # otherwise the Double DQN target is used:
                # Qmain(s,a) = r(s,a) + y * Qtarget(s1, argmax_a Qmain(s1,a))
                end_multiplier = -(trainBatch[:, 4] - 1)  # equals 1 - done
                doubleQ = Q2[range(self.param.batch_size), Q1]
                targetQ = trainBatch[:, 2] + (self.param.y * doubleQ *
                                              end_multiplier)
                _ = sess.run(self.tempQN.updateModel,
                             feed_dict={
                                 self.tempQN.scalarInput:
                                 np.vstack(trainBatch[:, 0]),
                                 self.tempQN.targetQ:
                                 targetQ,
                                 self.tempQN.actions:
                                 trainBatch[:, 1]
                             })

            if d or flag:
                break

        if self.istrain:
            # Environment reset for each episode
            s1 = self.env.reset(2)  # Rendered image of the alignment environment
            s1 = processState(s1)

            d = False  # The state of the game (End or Not)
            j = 0
            rT1 = 0  # Total reward
            rT2 = 0  # Total reward

            while j < self.env.sizeS1 + self.env.sizeS2:  # Run until the maximum episode length (sum of the sequence lengths)
                if self.env.seq1[self.env.x] == self.env.seq2[self.env.y]:
                    a = self.skip()
                else:
                    s1 = processState(self.env.renderDiff())
                    a = sess.run(self.tempQN.predict,
                                 feed_dict={self.tempQN.scalarInput: [s1]})

                # Apply each chosen action; record the transition, reward and done flag
                for _ in range(np.size(a)):
                    s = s1
                    s1, r, d = self.env.stepDiff(a[_])
                    s1 = processState(s1)
                    mainBuffer.add(
                        np.reshape(
                            np.array([s, a[_], r, s1, d]),
                            [1, 5]))  # Save the result into episode buffer
                    rT1 += r
                    rT2 += (r > 0)
                    j += 1

                    if d == True:
                        break

                if d == True:
                    break

        return rT1, rT2, j, mainBuffer

    def Global(self, sess, record=0):
        # Newly define experience buffer for new episode
        past = time.time()
        if self.FLAGS.show_align:
            dot_plot = 255 * np.ones((self.env.sizeS1, self.env.sizeS2))
        if self.FLAGS.print_align:
            Nucleotide = ["N", "A", "C", "G", "T"]
        if self.istrain:
            episodeBuffer = experience_buffer()
            # Environment reset for each episode
            s1 = self.env.reset()  # Rendered image of the alignment environment
            s1 = processState(s1)  # Resize to 1-dimensional vector
        else:
            s = processState(self.env.renderEnv())

        d = False  # The state of the game (End or Not)
        rT1 = 0  # Total reward
        rT2 = 0  # Total match
        j = 0

        while j < self.env.sizeS1 + self.env.sizeS2:  # Run until the maximum episode length (sum of the sequence lengths)
            #print(self.env.x, self.env.y)
            if self.FLAGS.display_process:
                if j % 1000 == 0:
                    now = time.time()

            # Exploration step
            if self.env.seq1[self.env.x] == self.env.seq2[self.env.y]:
                a = self.skip()
            elif self.istrain:
                if self.FLAGS.exploration == "e-greedy":
                    if (np.random.rand(1) < self.e
                            or self.total_steps < self.param.pre_train_steps):
                        a = [np.random.randint(0, self.param.n_action)]
                    else:
                        s1 = processState(self.env.renderEnv())
                        a = sess.run(self.mainQN.predict,
                                     feed_dict={self.mainQN.scalarInput: [s1]})
                elif self.FLAGS.exploration == "boltzmann":
                    temp = self.e
                    s1 = processState(self.env.renderEnv())
                    Qprobs = sess.run(self.mainQN.Qdist,
                                      feed_dict={
                                          self.mainQN.scalarInput: [s1],
                                          self.mainQN.Temp: [temp]
                                      })
                    # Sample an action in proportion to the Boltzmann
                    # (softmax) Q distribution; argmax recovers its index
                    action_value = np.random.choice(Qprobs[0], p=Qprobs[0])
                    a = [np.argmax(Qprobs[0] == action_value)]
                elif self.FLAGS.exploration == "bayesian":
                    keep = 1 - self.e
                    temp = self.e
                    s1 = processState(self.env.renderEnv())
                    Qprobs = sess.run(self.mainQN.Qdist,
                                      feed_dict={
                                          self.mainQN.scalarInput: [s1],
                                          self.mainQN.Temp: [temp],
                                          self.mainQN.keep_per: [keep]
                                      })
                    action_value = np.random.choice(Qprobs[0], p=Qprobs[0])
                    a = [np.argmax(Qprobs[0] == action_value)]
            else:
                #test = time.time()
                s1 = processState(self.env.renderEnv())
                #print("Rendering stage :",time.time()-test)
                #test = time.time()
                a = sess.run(self.mainQN.predict,
                             feed_dict={self.mainQN.scalarInput: [s1]})
                #print("Prediction stage :",time.time()-test)
                #test = time.time()

            # Training: step the environment and update the DQN
            if self.istrain:
                # Apply the action; record the transition, reward and done flag
                s = s1
                s1, r, d = self.env.step(a[0])
                j += 1
                s1 = processState(s1)
                self.total_steps += 1
                rT1 += r
                rT2 += (r > 0)
                episodeBuffer.add(
                    np.reshape(np.array([s, a[0], r, s1, d]),
                               [1, 5]))  # Save the result into episode buffer

                if self.total_steps > self.param.pre_train_steps:
                    # Refresh exploration probability (epsilon-greedy)
                    if self.e > self.param.endE:
                        self.e -= self.stepDrop

                    # For every update_freq, update the main network
                    if self.total_steps % (self.param.update_freq) == 0:
                        self.train(sess)
                        #print("Training stage :",time.time()-test)
            else:
                for _ in range(np.size(a)):
                    if self.FLAGS.show_align:
                        dot_plot[self.env.x][self.env.y] = 0
                    if self.FLAGS.print_align:
                        record.record(
                            self.env.x, self.env.y, a[_],
                            Nucleotide[self.env.seq1[self.env.x] + 1],
                            Nucleotide[self.env.seq2[self.env.y] + 1])

                    r, d = self.env.teststep(a[_])
                    j += 1
                    rT1 += r
                    rT2 += (r > 0)
                    if d == True:
                        break
                #print("Do step stage :",time.time()-test)

            if d == True:
                break

            if self.FLAGS.display_process:
                if j % 1000 == 999:
                    print("Align step is processed :", j + 1, "with",
                          time.time() - now)

        # Add the results of the episode into the total results
        if self.istrain:
            self.myBuffer.add(episodeBuffer.buffer)

        now = time.time()
        if self.FLAGS.show_align:
            return rT1, rT2, now - past, j, dot_plot
        return rT1, rT2, now - past, j

    def DiffGlobal(self, sess, record=0):
        # Newly define experience buffer for new episode
        past = time.time()
        if self.FLAGS.show_align:
            dot_plot = 255 * np.ones((self.env.sizeS1, self.env.sizeS2))
        if self.FLAGS.print_align:
            Nucleotide = ["N", "A", "C", "G", "T"]
        if self.istrain:
            episodeBuffer = experience_buffer()
            # Environment reset for each episode
            s1 = self.env.reset(2)  # Rendered image of the alignment environment
            s1 = processState(s1)  # Resize to 1-dimensional vector
        else:
            s = processState(self.env.renderDiff())

        d = False  # The state of the game (End or Not)
        rT1 = 0  # Total reward
        rT2 = 0  # Total match
        j = 0

        while j < self.env.sizeS1 + self.env.sizeS2:  # Run until the maximum episode length (sum of the sequence lengths)
            if self.FLAGS.display_process:
                if j % 1000 == 0:
                    now = time.time()

            # Exploration step
            if self.istrain and (np.random.rand(1) < self.e or self.total_steps
                                 < self.param.pre_train_steps):
                a = [np.random.randint(0, self.param.n_action)]
            elif self.env.seq1[self.env.x] == self.env.seq2[self.env.y]:
                a = self.skip()
            else:
                #test = time.time()
                s1 = processState(self.env.renderDiff())
                #print("Rendering stage :",time.time()-test)
                #test = time.time()
                a = sess.run(self.mainQN.predict,
                             feed_dict={self.mainQN.scalarInput: [s1]})
                #print("Prediction stage :",time.time()-test)
                #test = time.time()

            # Training: step the environment and update the DQN
            if self.istrain:
                # Apply the action; record the transition, reward and done flag
                s = s1
                s1, r, d = self.env.stepDiff(a[0])
                j += 1
                s1 = processState(s1)
                self.total_steps += 1
                rT1 += r
                rT2 += (r > 0)
                episodeBuffer.add(
                    np.reshape(np.array([s, a[0], r, s1, d]),
                               [1, 5]))  # Save the result into episode buffer

                if self.total_steps > self.param.pre_train_steps:
                    # Refresh exploration probability (epsilon-greedy)
                    if self.e > self.param.endE:
                        self.e -= self.stepDrop

                    # For every update_freq, update the main network
                    if self.total_steps % (self.param.update_freq) == 0:
                        self.train(sess)
                        #print("Training stage :",time.time()-test)
            else:
                for _ in range(np.size(a)):
                    if self.FLAGS.show_align:
                        dot_plot[self.env.x][self.env.y] = 0
                    if self.FLAGS.print_align:
                        record.record(
                            self.env.x, self.env.y, a[_],
                            Nucleotide[self.env.seq1[self.env.x] + 1],
                            Nucleotide[self.env.seq2[self.env.y] + 1])

                    r, d = self.env.teststep(a[_])
                    j += 1
                    rT1 += r
                    rT2 += (r > 0)
                    if d == True:
                        break
                #print("Do step stage :",time.time()-test)

            if d == True:
                break

            if self.FLAGS.display_process:
                if j % 1000 == 999:
                    print("Align step is processed :", j + 1, "with",
                          time.time() - now)

        # Add the results of the episode into the total results
        if self.istrain:
            self.myBuffer.add(episodeBuffer.buffer)

        now = time.time()
        if self.FLAGS.show_align:
            return rT1, rT2, now - past, j, dot_plot
        return rT1, rT2, now - past, j

    def FFTGlobal(self, sess, record=0):
        # Newly define experience buffer for new episode
        past = time.time()
        if self.FLAGS.show_align:
            dot_plot = 255 * np.ones((self.env.sizeS1, self.env.sizeS2))
        if self.FLAGS.print_align:
            Nucleotide = ["N", "A", "C", "G", "T"]
        if self.istrain:
            episodeBuffer = experience_buffer()
            # Environment reset for each episode
            s1 = self.env.reset(3)  # Rendered image of the alignment environment
            s1 = processState(s1)  # Resize to 1-dimensional vector
        else:
            s = processState(self.env.renderFFT())

        d = False  # The state of the game (End or Not)
        rT1 = 0  # Total reward
        rT2 = 0  # Total match
        j = 0

        while j < self.env.sizeS1 + self.env.sizeS2:  # Run until the maximum episode length (sum of the sequence lengths)
            if self.FLAGS.display_process:
                if j % 1000 == 0:
                    now = time.time()

            # Exploration step
            if self.istrain and (np.random.rand(1) < self.e or self.total_steps
                                 < self.param.pre_train_steps):
                a = [np.random.randint(0, self.param.n_action)]
            elif self.env.seq1[self.env.x] == self.env.seq2[self.env.y]:
                a = self.skip()
            else:
                #test = time.time()
                s1 = processState(self.env.renderFFT())
                #print("Rendering stage :",time.time()-test)
                #test = time.time()
                a = sess.run(self.mainQN.predict,
                             feed_dict={self.mainQN.scalarInput: [s1]})
                #print("Prediction stage :",time.time()-test)
                #test = time.time()

            # Training: step the environment and update the DQN
            if self.istrain:
                # Apply the action; record the transition, reward and done flag
                s = s1
                s1, r, d = self.env.stepFFT(a[0])
                j += 1
                s1 = processState(s1)
                self.total_steps += 1
                rT1 += r
                rT2 += (r > 0)
                episodeBuffer.add(
                    np.reshape(np.array([s, a[0], r, s1, d]),
                               [1, 5]))  # Save the result into episode buffer

                if self.total_steps > self.param.pre_train_steps:
                    # Refresh exploration probability (epsilon-greedy)
                    if self.e > self.param.endE:
                        self.e -= self.stepDrop

                    # For every update_freq, update the main network
                    if self.total_steps % (self.param.update_freq) == 0:
                        self.train(sess)
                        #print("Training stage :",time.time()-test)
            else:
                for _ in range(np.size(a)):
                    if self.FLAGS.show_align:
                        dot_plot[self.env.x][self.env.y] = 0
                    if self.FLAGS.print_align:
                        record.record(
                            self.env.x, self.env.y, a[_],
                            Nucleotide[self.env.seq1[self.env.x] + 1],
                            Nucleotide[self.env.seq2[self.env.y] + 1])

                    r, d = self.env.teststep(a[_])
                    j += 1
                    rT1 += r
                    rT2 += (r > 0)
                    if d == True:
                        break
                #print("Do step stage :",time.time()-test)

            if d == True:
                break

            if self.FLAGS.display_process:
                if j % 1000 == 999:
                    print("Align step is processed :", j + 1, "with",
                          time.time() - now)

        # Add the results of the episode into the total results
        if self.istrain:
            self.myBuffer.add(episodeBuffer.buffer)

        now = time.time()
        if self.FLAGS.show_align:
            return rT1, rT2, now - past, j, dot_plot
        return rT1, rT2, now - past, j

    def Local(self, sess, uX1, uX2, uY1, uY2, X):
        # TODO: reverse-complement functionality still needs to be added
        # Newly define experience buffer for new episode
        RCmode = 0 if uY1 < uY2 else 1  # RCmode 1: align against the reverse complement

        past = time.time()

        rT1o = 0
        rT2o = 0

        pathx = []
        pathy = []

        d = False  # The state of the game (End or Not)
        rT1 = 1  # Total reward
        rT2 = 1  # Total match
        j = 0

        if RCmode == 0:
            best = 1
            best2 = 1
            flag = 0
            pathx1 = []
            pathy1 = []

            #Forward Extension
            if (uX2 + 1 <= self.env.sizeS1) and (uY2 + 1 <= self.env.sizeS2):
                self.env.x = uX2 + 1
                self.env.y = uY2 + 1
                pathx1.append(self.env.x)
                pathy1.append(self.env.y)
                bestxy = [self.env.x, self.env.y]

                while j < self.env.sizeS1 + self.env.sizeS2 - uX2 - uY2:
                    # Skip process
                    if self.env.seq1[self.env.x] == self.env.seq2[self.env.y]:
                        a = self.skip()
                    else:
                        #test = time.time()
                        s1 = processState(self.env.renderEnv())
                        #print("Rendering stage :",time.time()-test)
                        #test = time.time()
                        a = sess.run(self.mainQN.predict,
                                     feed_dict={self.mainQN.scalarInput: [s1]})
                        #print("Prediction stage :",time.time()-test)
                        #test = time.time()

                    for _ in range(np.size(a)):
                        r, d = self.env.teststep(a[_])
                        pathx1.append(self.env.x)
                        pathy1.append(self.env.y)
                        j += 1
                        rT1 += r
                        rT2 += (r > 0)
                        if rT1 >= best:
                            best = rT1
                            best2 = rT2
                            bestxy = [self.env.x, self.env.y]

                        # If the score drops more than X below the best, stop extending
                        if rT1 < best - X:
                            flag = 1
                            break

                        if d == True:
                            flag = 1
                            break
                        #print("Do step stage :",time.time()-test)

                    if flag:
                        break

                bestp = function.check_where(pathx1, pathy1, bestxy)
                pathx1 = pathx1[:bestp + 1]
                pathy1 = pathy1[:bestp + 1]

            rT1o += best
            rT2o += best2

            d = False  # The state of the game (End or Not)
            rT1 = 1  # Total reward
            rT2 = 1  # Total match
            j = 0

            best = 1
            best2 = 1
            flag = 0
            pathx2 = []
            pathy2 = []

            #Reverse Extension
            if (uX1 - 1 >= 0) and (uY1 - 1 >= 0):
                self.env.x = uX1 - 1
                self.env.y = uY1 - 1
                pathx2.append(self.env.x)
                pathy2.append(self.env.y)
                bestxy = [self.env.x, self.env.y]

                while j < uX1 + uY1:
                    # Skip process
                    if self.env.seq1[self.env.x] == self.env.seq2[self.env.y]:
                        a = self.reverseskip()
                    else:
                        #test = time.time()
                        s1 = processState(self.env.renderRev())
                        #print("Rendering stage :",time.time()-test)
                        #test = time.time()
                        a = sess.run(self.mainQN.predict,
                                     feed_dict={self.mainQN.scalarInput: [s1]})
                        #print("Prediction stage :",time.time()-test)
                        #test = time.time()

                    for _ in range(np.size(a)):
                        r, d = self.env.teststep(10 + a[_])
                        pathx2.append(self.env.x)
                        pathy2.append(self.env.y)
                        j += 1
                        rT1 += r
                        rT2 += (r > 0)
                        if rT1 >= best:
                            best = rT1
                            best2 = rT2
                            bestxy = [self.env.x, self.env.y]

                        # If the score drops more than X below the best, stop extending
                        if rT1 < best - X:
                            flag = 1
                            break

                        if d == True:
                            flag = 1
                            break
                        #print("Do step stage :",time.time()-test)

                    if flag:
                        break

                bestp = function.check_where(pathx2, pathy2, bestxy)
                pathx2 = pathx2[:bestp + 1]
                pathy2 = pathy2[:bestp + 1]

            pathx = pathx2[::-1] + list(range(uX1, uX2 + 1)) + pathx1
            pathy = pathy2[::-1] + list(range(uY1, uY2 + 1)) + pathy1

            rT1o += best
            rT2o += best2

            same = np.sum(
                np.array(self.env.seq1[list(range(uX1, uX2 + 1))]) == np.array(
                    self.env.seq2[list(range(uY1, uY2 + 1))]))
            length = uX2 - uX1 + 1

            rT1o += self.env.reward[0] * same + self.env.reward[1] * (length -
                                                                      same)
            rT2o += same

            path = [pathx, pathy]

        else:
            best = 1
            best2 = 1
            flag = 0
            pathx1 = []
            pathy1 = []

            #Forward Extension
            if (uX2 + 1 <= self.env.sizeS1) and (uY2 - 1 >= 0):
                self.env.x = uX2 + 1
                self.env.y = uY1 - 1
                pathx1.append(self.env.x)
                pathy1.append(self.env.y)
                bestxy = [self.env.x, self.env.y]

                while j < self.env.sizeS1 - uX2 + uY2:
                    # Skip process
                    if self.env.seq1[self.env.x] == self.env.rev2[self.env.y]:
                        a = self.skipRC()
                    else:
                        #test = time.time()
                        s1 = processState(self.env.renderRC())
                        #print("Rendering stage :",time.time()-test)
                        #test = time.time()
                        a = sess.run(self.mainQN.predict,
                                     feed_dict={self.mainQN.scalarInput: [s1]})
                        #print("Prediction stage :",time.time()-test)
                        #test = time.time()

                    for _ in range(np.size(a)):
                        r, d = self.env.stepRC(a[_])
                        pathx1.append(self.env.x)
                        pathy1.append(self.env.y)
                        j += 1
                        rT1 += r
                        rT2 += (r > 0)
                        if rT1 >= best:
                            best = rT1
                            best2 = rT2
                            bestxy = [self.env.x, self.env.y]

                        # If the score drops more than X below the best, stop extending
                        if rT1 < best - X:
                            flag = 1
                            break

                        if d == True:
                            flag = 1
                            break
                        #print("Do step stage :",time.time()-test)

                    if flag:
                        break

                bestp = function.check_where(pathx1, pathy1, bestxy)
                pathx1 = pathx1[:bestp + 1]
                pathy1 = pathy1[:bestp + 1]

            rT1o += best
            rT2o += best2

            d = False  # The state of the game (End or Not)
            rT1 = 1  # Total reward
            rT2 = 1  # Total match
            j = 0

            best = 1
            best2 = 1
            flag = 0
            pathx2 = []
            pathy2 = []

            #Reverse Extension
            if (uX1 - 1 >= 0) and (uY1 + 1 <= self.env.sizeS2):
                self.env.x = uX1 - 1
                self.env.y = uY1 + 1
                pathx2.append(self.env.x)
                pathy2.append(self.env.y)
                bestxy = [self.env.x, self.env.y]

                while j < uX1 + self.env.sizeS2 - uY1:
                    # Skip process
                    if self.env.seq1[self.env.x] == self.env.rev2[self.env.y]:
                        a = self.reverseskipRC()
                    else:
                        #test = time.time()
                        s1 = processState(self.env.renderRCRev())
                        #print("Rendering stage :",time.time()-test)
                        #test = time.time()
                        a = sess.run(self.mainQN.predict,
                                     feed_dict={self.mainQN.scalarInput: [s1]})
                        #print("Prediction stage :",time.time()-test)
                        #test = time.time()

                    for _ in range(np.size(a)):
                        r, d = self.env.stepRC(10 + a[_])
                        pathx2.append(self.env.x)
                        pathy2.append(self.env.y)
                        j += 1
                        rT1 += r
                        rT2 += (r > 0)
                        if rT1 >= best:
                            best = rT1
                            best2 = rT2
                            bestxy = [self.env.x, self.env.y]

                        # If the score drops more than X below the best, stop extending
                        if rT1 < best - X:
                            flag = 1
                            break

                        if d == True:
                            flag = 1
                            break
                        #print("Do step stage :",time.time()-test)

                    if flag:
                        break

                bestp = function.check_where(pathx2, pathy2, bestxy)
                pathx2 = pathx2[:bestp + 1]
                pathy2 = pathy2[:bestp + 1]

            pathx = pathx2[::-1] + list(range(uX1, uX2 + 1)) + pathx1
            pathy = pathy2[::-1] + list(range(uY1, uY2 - 1, -1)) + pathy1

            rT1o += best
            rT2o += best2

            same = np.sum(
                np.array(self.env.seq1[list(range(uX1, uX2 + 1))]) == np.array(
                    self.env.rev2[list(range(uY1, uY2 - 1, -1))]))
            length = uX2 - uX1 + 1

            rT1o += self.env.reward[0] * same + self.env.reward[1] * (length -
                                                                      same)
            rT2o += same

            path = [pathx, pathy]

        now = time.time()

        return rT1o, rT2o, now - past, j, path
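
A hypothetical driver for the Agent class above (FLAGS, train_env, and the network holder model are assumed to be built as in Examples #1-#5; Global returns an extra dot_plot when FLAGS.show_align is set):

import tensorflow as tf

agent = Agent(FLAGS, istrain=True, game_env=train_env, model=model)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    rT1, rT2, elapsed, steps = agent.Global(sess)
    print("reward:", rT1, "matches:", rT2, "steps:", steps, "sec:", elapsed)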
Example #7
    resume = False
""" Main test step """
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()

# device_count={'GPU': 0} hides every GPU from TensorFlow, so apply it only
# when use_GPU is off; with use_GPU on, the default session may use the GPU
if FLAGS.use_GPU:
    sess = tf.Session()
else:
    sess = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}))

sess.run(init)

param = import_module('DQNalign.param.' + FLAGS.network_set)
""" Define sequence alignment environment """
env = Pairwise(train_env, 0, Z=train_model.param.Z)

mainQN = train_model.mainQN
targetQN = train_model.targetQN
trainables = train_model.trainables
targetOps = train_model.targetOps
""" Initialize the variables """
total_steps = 0
start = time.time()
myBuffer = experience_buffer()

print('Loading Model...')
ckpt = tf.train.get_checkpoint_state(train_env.path)
saver.restore(sess, ckpt.model_checkpoint_path)

s = env.reset()  # Rendered image of the alignment environment
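
processState is referenced throughout but never defined in these snippets; a plausible sketch, assuming it merely flattens the rendered window into the 1-D vector that the networks' scalarInput placeholder expects:

import numpy as np

def processState(states):
    # Flatten the rendered alignment window into a 1-D feature vector
    return np.reshape(states, [-1])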