Example #1
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 10
        self.TRANSFER = 10
        self.BATCH_SIZE = 128
        #self.BATCH_SIZE = 5
        self.GAMMA = 0.99
        #self.SAMPLE_ALPHA = 0.5
        #self.SAMPLE_EPSILON = 0.
        #self.SAMPLE_BETA = 0.
        #self.SAMPLE_S = 44.8
        self.SAMPLE_S = 5.0
        self.SAMPLE_Q = 1.0

        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(unzipped[0])),
                               volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor),
                                  volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])),
                                        volatile=True)
            #print('average distance: {}' . format(dist_norm))
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = self.targetNet(next_state_batch).max(
                1)[0].unsqueeze(1)
            #prediction_state_values = self.targetNet(state_batch).gather(1, action_batch)
            #not_action_batch = Variable(torch.from_numpy(1-np.array(unzipped[1])).type(LongTensor), volatile=True)
            #prediction_state_nonterm_values = self.targetNet(state_batch).gather(1, not_action_batch)
            #print('term average value: {}' . format(torch.sum((1-term_batch) * prediction_state_values).data[0]/torch.sum(1-term_batch).data[0]))
            #print('nonterm average value: {}' . format(torch.sum((1-term_batch) * prediction_state_nonterm_values).data[0]/torch.sum(1-term_batch).data[0]))
            next_state_values = term_batch * next_state_values

            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
        # calculate the probability for each transition
        # calculate distance matrix
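        # squared Euclidean distances between feature vectors, computed via
        # D[i, j] = ||f_i||^2 + ||f_j||^2 - 2 * <f_i, f_j>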
        state_feature_batch = self.targetNet.getstate(state_batch)
        inner_product = state_feature_batch.matmul(
            state_feature_batch.transpose(1, 0))
        state_feature_batch_l2 = (state_feature_batch**2).sum(
            dim=1, keepdim=True).expand_as(inner_product)
        distance_matrix = state_feature_batch_l2 + state_feature_batch_l2.transpose(
            1, 0) - 2 * inner_product
        #print('distance state')
        #print(distance_matrix.data)
        # calculate Q-value distance matrix
        # Here use target value to calculate
        Q_dist_matrix = target_batch.expand_as(distance_matrix)
        Q_dist_matrix = Q_dist_matrix - Q_dist_matrix.transpose(
            1, 0)  # not absolute value
        Q_dist_matrix = Q_dist_matrix.abs()
        #print('distance q')
        #print(Q_dist_matrix.data)
        # Number[i,j] = Number[i,j] + (D_f[i,j] <= sample_S^2 AND D_Q[i,j] <= sample_Q AND action[i]=action[j])
        # only consider same actions
        Action_Mask = (action_batch.expand_as(distance_matrix)) == (
            action_batch.transpose(1, 0).expand_as(distance_matrix))
        Mask = (distance_matrix.data <=
                (self.SAMPLE_S)) & (Q_dist_matrix.data <=
                                    self.SAMPLE_Q) & Action_Mask.data
        Cluster = []
        #print('mask')
        counter = 0
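        # Greedy vertex cover over the similarity graph defined by Mask:
        # repeatedly pick the transition that is similar to the most remaining
        # transitions (largest row sum), record its neighbourhood as one
        # cluster, then remove that neighbourhood from the graph.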
        while True:
            # clustering by VERTEX-COVER-ALL-VERTEX, always find largest degree
            #print('counter = {}' . format(counter))
            counter += 1

            Number = Mask.sum(dim=1)
            value, indx = Number.max(dim=0)
            #print('indx= {}' . format(indx))
            if value[0] == 0:
                # already empty
                break
            v = Mask[indx]
            #print(v)
            #print(Mask)
            Cluster.append(v)
            # delete vertices
            Delete = v.expand_as(Mask) | v.transpose(1, 0).expand_as(Mask)
            Delete = Delete ^ 1
            #Delete = v.transpose(1,0).matmul(v) ^ 1
            #print(Delete)
            Mask = Mask & Delete
        k = len(Cluster)
        Cluster = torch.cat(Cluster)
        #print('cluster')
        #print(Cluster)
        Number = Cluster.sum(dim=1).type(LongTensor)
        probability_batch = torch.ones(k) / float(k)
        cluster_is = torch.multinomial(probability_batch,
                                       self.BATCH_SIZE,
                                       replacement=True)
        # convert the cluster indices to number of items in each cluster
        Sample_num = torch.eye(k).index_select(
            0, cluster_is).sum(dim=0).type(LongTensor)
        #N = Cluster[0].size()[0] # number of vertices
        state_sample = []
        action_sample = []
        target_sample = []
        for i in range(k):
            n = Sample_num[i]
            N = Number[i]
            if n == 0:
                continue
            cluster = Cluster[i]
            # get nonzero indices
            v_indices = cluster.nonzero().squeeze(1)
            if n == N:
                # pick up all
                state_sample.append(state_batch.index_select(0, v_indices))
                action_sample.append(action_batch.index_select(0, v_indices))
                target_sample.append(target_batch.index_select(0, v_indices))
                continue
            prob = torch.ones(v_indices.size()) / n
            if n < N:
                # uniformly pick
                v_indices_is = torch.multinomial(prob, n)
                v_indices = v_indices.index_select(0, v_indices_is)
                state_sample.append(state_batch.index_select(0, v_indices))
                action_sample.append(action_batch.index_select(0, v_indices))
                target_sample.append(target_batch.index_select(0, v_indices))
                continue
            # uniformly pick with replacement
            v_indices_is = torch.multinomial(prob, n, replacement=True)
            v_indices = v_indices.index_select(0, v_indices_is)
            state_sample.append(state_batch.index_select(0, v_indices))
            action_sample.append(action_batch.index_select(0, v_indices))
            target_sample.append(target_batch.index_select(0, v_indices))
        state_batch = torch.cat(state_sample)
        action_batch = torch.cat(action_sample)
        target_batch = torch.cat(target_sample)

        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        #   0. loop until SENT_FLAG is not set
        #
        #   1. loop for a fixed # of steps:
        #         minibatch, and get the target value for the batch
        #         optimize the net parameters by this batch
        #         every fixed number of steps, copy weights from Q-net to target-net
        #
        #   2. copy the Q-net weights into the shared weights
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping...')
                time.sleep(0.1)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    #self.semaphore.acquire()
                    self.copy_weights()
                    #self.semaphore.release()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            if i == 50:
                pretrain = False
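
For context, a minimal sketch of the producer side of this handshake: the Evaluator trains whenever SENT_FLAG is False and raises the flag after publishing fresh weights. The Manager-backed memory and shared objects and the collect_transition() helper are illustrative assumptions, not part of the original code; DQN is the network class the examples already use.

import multiprocessing

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    memory = manager.list()                    # shared replay buffer of transitions
    shared = manager.dict({'SENT_FLAG': True,  # keep the evaluator asleep at first
                           'weights': None})
    semaphore = multiprocessing.Semaphore(1)

    evaluator = Evaluator(memory, shared, semaphore)
    evaluator.start()

    actor = DQN()  # acting copy of the network
    while True:
        # collect_transition() is hypothetical: it should return one
        # (state, action, reward, next_state, not_terminal) tuple
        memory.append(collect_transition(actor))
        if shared['SENT_FLAG'] and len(memory) >= evaluator.BATCH_SIZE:
            if shared['weights'] is not None:
                actor.load_state_dict(shared['weights'])  # pull the latest weights
            shared['SENT_FLAG'] = False  # let the evaluator run another training round
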
Example #2
class Evaluator(multiprocessing.Process):
    def __init__(self, shared):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 500
        self.TRANSFER = 100
        self.BATCH_SIZE = 32
        self.GAMMA = 1.0
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}

    def minibatch(self, exp_replay, pretrain=False):
        batch = exp_replay.sample(self.BATCH_SIZE)
        unzipped = list(zip(*batch))
        state_batch = np.concatenate(list(unzipped[0]))
        state_batch = Variable(torch.from_numpy(state_batch))
        action_batch = np.concatenate(list(unzipped[1]))
        action_batch = Variable(torch.from_numpy(action_batch))
        reward_batch = np.concatenate(list(unzipped[2]))
        reward_batch = Variable(torch.from_numpy(reward_batch),
                                requires_grad=False)
        #state_batch = Variable(torch.cat(list(unzipped[0])).clone())
        #action_batch = Variable(torch.cat(list(unzipped[1])).clone())
        #reward_batch = Variable(torch.cat(list(unzipped[2])).clone(), requires_grad=False)

        if pretrain:
            # only use reward
            return state_batch, action_batch, reward_batch
        else:
            term_batch = np.concatenate(list(unzipped[5]))
            term_batch = Variable(torch.from_numpy(term_batch), volatile=True)
            next_action_batch = np.concatenate(list(unzipped[4]))
            next_action_batch = Variable(torch.from_numpy(next_action_batch),
                                         volatile=True)
            next_state_batch = np.concatenate(list(unzipped[3]))
            next_state_batch = Variable(torch.from_numpy(next_state_batch),
                                        volatile=True)
            next_state_values = self.targetNet(next_state_batch).gather(
                1, next_action_batch)
            #term_batch = Variable(torch.cat(list(unzipped[5]).clone()), volatile=True)
            #next_action_batch = Variable(torch.cat(list(unzipped[4]).clone()), volatile=True)
            #next_state_values = self.targetNet.evaluate(list(unzipped[3]).clone()).gather(1, next_action_batch)
            next_state_values = term_batch * next_state_values
            print(next_state_values)
            next_state_values.volatile = False
            next_state_values.requires_grad = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
            return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        #   0. loop until SENT_FLAG is not set
        #
        #   1. loop for a fixed # of steps:
        #         minibatch, and get the target value for the batch
        #         optimize the net parameters by this batch
        #         every fixed number of steps, copy weights from Q-net to target-net
        #
        #   2. copy the Q-net weights into the shared weights
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        pretrain = True
        while True:
            #print('evaluator starts...')
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping... size: {}'.format(len(
                    self.shared['memory'])))
                time.sleep(1)
            for step_i in range(self.TRAIN_MAX):
                # minibatch, and get the target value
                print('training... step {}'.format(step_i))
                #memory = deepcopy(self.shared['memory'])
                batch_tuple = self.minibatch(self.shared['memory'], pretrain)
                #print('got batch tuple')
                loss = self.net.optimize(batch_tuple)
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True

            pretrain = False
Example #3
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 10
        self.TRANSFER = 10
        self.BATCH_SIZE = 32
        self.GAMMA = 0.99
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPSILON = 0.
        self.SAMPLE_BETA = 0.

        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(unzipped[0])),
                               volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        #state_batch = Variable(torch.cat(list(unzipped[0])).clone(), volatile=True)
        #action_batch = Variable(torch.cat(list(unzipped[1])).clone(), volatile=True)
        #reward_batch = Variable(torch.cat(list(unzipped[2])).clone(), volatile=True)
        #target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor),
                                  volatile=True)
            #term_batch = Variable(torch.cat(list(unzipped[4])).clone(), volatile=True)
            #next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(), volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])),
                                        volatile=True)
            dist_norm = self.targetNet.getdistance(state_batch,
                                                   next_state_batch)
            #print('average distance: {}' . format(dist_norm))
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = self.targetNet(next_state_batch).max(
                1)[0].unsqueeze(1)
            #prediction_state_values = self.targetNet(state_batch).gather(1, action_batch)
            #not_action_batch = Variable(torch.from_numpy(1-np.array(unzipped[1])).type(LongTensor), volatile=True)
            #prediction_state_nonterm_values = self.targetNet(state_batch).gather(1, not_action_batch)
            #print('term average value: {}' . format(torch.sum((1-term_batch) * prediction_state_values).data[0]/torch.sum(1-term_batch).data[0]))
            #print('nonterm average value: {}' . format(torch.sum((1-term_batch) * prediction_state_nonterm_values).data[0]/torch.sum(1-term_batch).data[0]))
            next_state_values = term_batch * next_state_values

            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
        # calculate the probability for each transition
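        # prioritised sampling: each transition is drawn with probability
        # proportional to |target - Q(s, a)|^SAMPLE_ALPHA, i.e. its TD-error magnitude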
        state_values = self.net(state_batch).gather(1, action_batch)
        probability_batch = torch.pow(torch.abs(target_batch - state_values),
                                      self.SAMPLE_ALPHA).squeeze(1)
        print(probability_batch)
        sample_is = torch.multinomial(probability_batch, self.BATCH_SIZE)
        state_batch = state_batch.index_select(0, sample_is)
        action_batch = action_batch.index_select(0, sample_is)
        target_batch = target_batch.index_select(0, sample_is)
        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        #   0. loop until SENT_FLAG is not set
        #
        #   1. loop for a fixed # of steps:
        #         minibatch, and get the target value for the batch
        #         optimize the net parameters by this batch
        #         every fixed number of steps, copy weights from Q-net to target-net
        #
        #   2. copy the Q-net weights into the shared weights
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping...')
                time.sleep(0.1)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    #self.semaphore.acquire()
                    self.copy_weights()
                    #self.semaphore.release()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True

            pretrain = False
Example #4
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 1
        self.TRANSFER = 1
        self.BATCH_SIZE = 9
        #self.BATCH_SIZE = 5
        self.GAMMA = 0.99
        #self.SAMPLE_ALPHA = 0.5
        #self.SAMPLE_EPSILON = 0.
        #self.SAMPLE_BETA = 0.
        #self.SAMPLE_S = 44.8
        self.SAMPLE_S = 5.0
        self.SAMPLE_Q = 1.0

        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(unzipped[0])),
                               volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor),
                                  volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])),
                                        volatile=True)
            #print('average distance: {}' . format(dist_norm))
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = self.targetNet(next_state_batch).max(
                1)[0].unsqueeze(1)
            #prediction_state_values = self.targetNet(state_batch).gather(1, action_batch)
            #not_action_batch = Variable(torch.from_numpy(1-np.array(unzipped[1])).type(LongTensor), volatile=True)
            #prediction_state_nonterm_values = self.targetNet(state_batch).gather(1, not_action_batch)
            #print('term average value: {}' . format(torch.sum((1-term_batch) * prediction_state_values).data[0]/torch.sum(1-term_batch).data[0]))
            #print('nonterm average value: {}' . format(torch.sum((1-term_batch) * prediction_state_nonterm_values).data[0]/torch.sum(1-term_batch).data[0]))
            next_state_values = term_batch * next_state_values

            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
        # calculate the probability for each transition
        # calculate distance matrix
        state_feature_batch = self.targetNet.getstate(state_batch)
        inner_product = state_feature_batch.matmul(
            state_feature_batch.transpose(1, 0))
        state_feature_batch_l2 = (state_feature_batch**2).sum(
            dim=1, keepdim=True).expand_as(inner_product)
        distance_matrix = state_feature_batch_l2 + state_feature_batch_l2.transpose(
            1, 0) - 2 * inner_product
        #print('distance state')
        #print(distance_matrix.data)
        # calculate Q-value distance matrix
        # Here use target value to calculate
        Q_dist_matrix = target_batch.expand_as(distance_matrix)
        Q_dist_matrix = Q_dist_matrix - Q_dist_matrix.transpose(
            1, 0)  # not absolute value
        Q_dist_matrix = Q_dist_matrix.abs()
        #print('distance q')
        #print(Q_dist_matrix.data)
        # Number[i,j] = Number[i,j] + (D_f[i,j] <= sample_S^2 AND D_Q[i,j] <= sample_Q AND action[i]=action[j])
        # only consider same actions
        Action_Mask = (action_batch.expand_as(distance_matrix)) == (
            action_batch.transpose(1, 0).expand_as(distance_matrix))
        Mask = (distance_matrix.data <=
                (self.SAMPLE_S**2)) & (Q_dist_matrix.data <=
                                       self.SAMPLE_Q) & Action_Mask.data
        Mask = Mask.type(FloatTensor)
        Number = Mask.sum(dim=1, keepdim=True)
        # using the mask to calculate the number used for each transition
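        # p_i is proportional to (1/N_i) * sum_j Mask[i, j] / N_j: every transition
        # spreads a unit of weight uniformly over its N_j neighbours, and each
        # transition then averages what it receives over its own neighbourhood size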
        probability_batch = Mask.matmul(1. / Number) / Number
        probability_batch = probability_batch.squeeze(1)
        #print(probability_batch)
        sample_is = torch.multinomial(probability_batch, self.BATCH_SIZE)
        state_batch = state_batch.index_select(0, sample_is)
        action_batch = action_batch.index_select(0, sample_is)
        target_batch = target_batch.index_select(0, sample_is)
        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        #   0. loop until SENT_FLAG is not set
        #
        #   1. loop for a fixed # of steps:
        #         minibatch, and get the target value for the batch
        #         optimize the net parameters by this batch
        #         every fixed number of steps, copy weights from Q-net to target-net
        #
        #   2. copy the Q-net weights into the shared weights
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping...')
                time.sleep(0.1)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    #self.semaphore.acquire()
                    self.copy_weights()
                    #self.semaphore.release()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            if i == 50:
                pretrain = False
Example #5
class Evaluator(multiprocessing.Process):
    def __init__(self, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 50
        self.TRANSFER = 50
        self.BATCH_SIZE = 32
        self.GAMMA = 0.99
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPSILON = 0.
        self.SAMPLE_BETA = 0.

        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):

        batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*batch))
        #state_batch = torch.from_numpy(np.concatenate(list(unzipped[0])))
        #state_batch = Variable(state_batch)
        #action_batch = torch.from_numpy(np.concatenate(list(unzipped[1])))
        #action_batch = Variable(action_batch)
        #reward_batch = torch.from_numpy(np.concatenate(list(unzipped[2])))
        #reward_batch = Variable(reward_batch, requires_grad=False)
        state_batch = Variable(torch.cat(list(unzipped[0])).clone())
        action_batch = Variable(torch.cat(list(unzipped[1])).clone())
        reward_batch = Variable(torch.cat(list(unzipped[2])).clone(),
                                requires_grad=False)

        if pretrain:
            # only use reward
            return state_batch, action_batch, reward_batch
        else:
            #term_batch = torch.from_numpy(np.concatenate(list(unzipped[5])))
            #term_batch = Variable(term_batch, volatile=True)
            term_batch = Variable(torch.cat(list(unzipped[5])).clone(),
                                  volatile=True)
            #next_action_batch = torch.from_numpy(np.concatenate(list(unzipped[4])))
            #next_action_batch = Variable(next_action_batch, volatile=True)
            next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(),
                                         volatile=True)
            #next_state = torch.from_numpy(np.concatenate(list(unzipped[3])))
            #next_state = Variable(next_state, volatile=True)
            #next_state_values = self.targetNet(next_state).gather(1, next_action_batch)
            next_state_values = self.targetNet.evaluate(list(
                unzipped[3])).gather(1, next_action_batch)
            #non_final_mask = ByteTensor(tuple(map(lambda s: s is not None, list(unzipped[3]))))
            #non_final_next_states = Variable(torch.cat([s for s in list(unzipped[3]) if s is not None]),
            #                                 volatile=True)
            #next_state_values = Variable(torch.zeros(self.BATCH_SIZE).type(Tensor))
            #next_state_values[non_final_mask] = self.targetNet(non_final_next_states).gather(1, next_action_batch)
            #next_state_values.volatile = False
            #print(next_state_values)
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            #next_state_values.requires_grad = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
            return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        #   0. loop until SENT_FLAG is not set
        #
        #   1. loop for a fixed # of steps:
        #         minibatch, and get the target value for the batch
        #         optimize the net parameters by this batch
        #         every fixed number of steps, copy weights from Q-net to target-net
        #
        #   2. copy the Q-net weights into the shared weights
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        pretrain = True
        while True:
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping... size: {}'.format(len(
                    self.shared['memory'])))
                time.sleep(0.1)
            print('training...')
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                self.semaphore.acquire()
                memory = copy.deepcopy(self.shared['memory'])
                self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue

                batch_tuple = self.minibatch(memory, pretrain)
                #print('got batch tuple')
                #print(batch_tuple[0].type)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True

            pretrain = False
Example #6
class MyProcess(mp.Process):
    def __init__(self, inputs):
        mp.Process.__init__(self)
        self.BATCH_SIZE = 32
        self.TRAIN_MAX = 500
        self.TRANSFER = 100
        self.GAMMA = 1.0
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01
        self.demonet = DQN()
        self.targetnet = DQN()
        self.copy_weights()
        self.demonet.setOptimizer(
            optim.RMSprop(self.demonet.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.inputs = inputs
        #self.demonet.setOptimizer(optim.Adam(params=self.demonet.parameters()))
    def copy_weights(self):
        self.targetnet.load_state_dict(self.demonet.state_dict())

    def minibatch(self, exp_replay, pretrain=False):
        batch = exp_replay.sample(self.BATCH_SIZE)
        unzipped = list(zip(*batch))
        #state_batch = np.concatenate(list(unzipped[0]))
        #state_batch = Variable(torch.from_numpy(state_batch))
        #action_batch = np.concatenate(list(unzipped[1]))
        #action_batch = Variable(torch.from_numpy(action_batch))
        #reward_batch = np.concatenate(list(unzipped[2]))
        #reward_batch = Variable(torch.from_numpy(reward_batch), requires_grad=False)
        state_batch = Variable(torch.cat(list(unzipped[0])).clone())
        action_batch = Variable(torch.cat(list(unzipped[1])).clone())
        reward_batch = Variable(torch.cat(list(unzipped[2])).clone(),
                                requires_grad=False)

        if pretrain:
            # only use reward
            return state_batch, action_batch, reward_batch
        else:
            #term_batch = np.concatenate(list(unzipped[5]))
            #term_batch = Variable(torch.from_numpy(term_batch), volatile=True)
            #next_action_batch = np.concatenate(list(unzipped[4]))
            #next_action_batch = Variable(torch.from_numpy(next_action_batch), volatile=True)
            #next_state_batch = np.concatenate(list(unzipped[3]))
            #next_state_batch = Variable(torch.from_numpy(next_state_batch), volatile=True)
            #next_state_values = self.targetNet(next_state_batch).gather(1,next_action_batch)
            term_batch = Variable(torch.cat(list(unzipped[5])).clone(),
                                  volatile=True)
            next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(),
                                         volatile=True)
            next_state_values = self.targetnet.evaluate(
                list(unzipped[3])).gather(1, next_action_batch)
            next_state_values = term_batch * next_state_values
            print(next_state_values)
            next_state_values.volatile = False
            next_state_values.requires_grad = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
            return state_batch, action_batch, target_batch

    def run(self):
        pretrain = True
        while True:
            while self.inputs['SENT_FLAG']:
                print('sleeping... size: {}'.format(len(
                    self.inputs['inputs'])))
                time.sleep(1)
            for step_i in range(self.TRAIN_MAX):
                sample = self.minibatch(self.inputs['inputs'], pretrain)
                self.demonet(sample[0])
                print('hello world')
                loss = self.demonet.optimize(sample)
                #time.sleep(1)
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.inputs['weights'] = self.demonet.state_dict()
            self.inputs['SENT_FLAG'] = True