Example #1
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 10
        self.TRANSFER = 10
        self.BATCH_SIZE = 128
        #self.BATCH_SIZE = 5
        self.GAMMA = 0.99
        #self.SAMPLE_ALPHA = 0.5
        #self.SAMPLE_EPSILON = 0.
        #self.SAMPLE_BETA = 0.
        #self.SAMPLE_S = 44.8
        self.SAMPLE_S = 5.0
        self.SAMPLE_Q = 1.0

        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(unzipped[0])),
                               volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor),
                                  volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])),
                                        volatile=True)
            #print('average distance: {}' . format(dist_norm))
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = self.targetNet(next_state_batch).max(
                1)[0].unsqueeze(1)
            #prediction_state_values = self.targetNet(state_batch).gather(1, action_batch)
            #not_action_batch = Variable(torch.from_numpy(1-np.array(unzipped[1])).type(LongTensor), volatile=True)
            #prediction_state_nonterm_values = self.targetNet(state_batch).gather(1, not_action_batch)
            #print('term average value: {}' . format(torch.sum((1-term_batch) * prediction_state_values).data[0]/torch.sum(1-term_batch).data[0]))
            #print('nonterm average value: {}' . format(torch.sum((1-term_batch) * prediction_state_nonterm_values).data[0]/torch.sum(1-term_batch).data[0]))
            next_state_values = term_batch * next_state_values

            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
        # calculate the probability for each transition
        # calculate distance matrix
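        # squared Euclidean distances between feature vectors, computed via
        # D[i, j] = ||f_i||^2 + ||f_j||^2 - 2 * <f_i, f_j>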
        state_feature_batch = self.targetNet.getstate(state_batch)
        inner_product = state_feature_batch.matmul(
            state_feature_batch.transpose(1, 0))
        state_feature_batch_l2 = (state_feature_batch**2).sum(
            dim=1, keepdim=True).expand_as(inner_product)
        distance_matrix = state_feature_batch_l2 + state_feature_batch_l2.transpose(
            1, 0) - 2 * inner_product
        #print('distance state')
        #print(distance_matrix.data)
        # calculate Q-value distance matrix
        # Here use target value to calculate
        Q_dist_matrix = target_batch.expand_as(distance_matrix)
        Q_dist_matrix = Q_dist_matrix - Q_dist_matrix.transpose(
            1, 0)  # not absolute value
        Q_dist_matrix = Q_dist_matrix.abs()
        #print('distance q')
        #print(Q_dist_matrix.data)
        # Number[i,j] = Number[i,j] + (D_f[i,j] <= sample_S^2 AND D_Q[i,j] <= sample_Q AND action[i]=action[j])
        # only consider same actions
        Action_Mask = (action_batch.expand_as(distance_matrix)) == (
            action_batch.transpose(1, 0).expand_as(distance_matrix))
        Mask = (distance_matrix.data <=
                (self.SAMPLE_S)) & (Q_dist_matrix.data <=
                                    self.SAMPLE_Q) & Action_Mask.data
        Cluster = []
        #print('mask')
        counter = 0
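        # Greedy vertex cover over the similarity graph defined by Mask:
        # repeatedly pick the transition that is similar to the most remaining
        # transitions (largest row sum), record its neighbourhood as one
        # cluster, then remove that neighbourhood from the graph.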
        while True:
            # clustering by VERTEX-COVER-ALL-VERTEX, always find largest degree
            #print('counter = {}' . format(counter))
            counter += 1

            Number = Mask.sum(dim=1)
            value, indx = Number.max(dim=0)
            #print('indx= {}' . format(indx))
            if value[0] == 0:
                # already empty
                break
            v = Mask[indx]
            #print(v)
            #print(Mask)
            Cluster.append(v)
            # delete vertices
            Delete = v.expand_as(Mask) | v.transpose(1, 0).expand_as(Mask)
            Delete = Delete ^ 1
            #Delete = v.transpose(1,0).matmul(v) ^ 1
            #print(Delete)
            Mask = Mask & Delete
        k = len(Cluster)
        Cluster = torch.cat(Cluster)
        #print('cluster')
        #print(Cluster)
        Number = Cluster.sum(dim=1).type(LongTensor)
        probability_batch = torch.ones(k) / float(k)
        cluster_is = torch.multinomial(probability_batch,
                                       self.BATCH_SIZE,
                                       replacement=True)
        # convert the cluster indices to number of items in each cluster
        Sample_num = torch.eye(k).index_select(
            0, cluster_is).sum(dim=0).type(LongTensor)
        #N = Cluster[0].size()[0] # number of vertices
        state_sample = []
        action_sample = []
        target_sample = []
        for i in range(k):
            n = Sample_num[i]
            N = Number[i]
            if n == 0:
                continue
            cluster = Cluster[i]
            # get nonzero indices
            v_indices = cluster.nonzero().squeeze(1)
            if n == N:
                # pick up all
                state_sample.append(state_batch.index_select(0, v_indices))
                action_sample.append(action_batch.index_select(0, v_indices))
                target_sample.append(target_batch.index_select(0, v_indices))
                continue
            prob = torch.ones(v_indices.size()) / n
            if n < N:
                # uniformly pick
                v_indices_is = torch.multinomial(prob, n)
                v_indices = v_indices.index_select(0, v_indices_is)
                state_sample.append(state_batch.index_select(0, v_indices))
                action_sample.append(action_batch.index_select(0, v_indices))
                target_sample.append(target_batch.index_select(0, v_indices))
                continue
            # uniformly pick with replacement
            v_indices_is = torch.multinomial(prob, n, replacement=True)
            v_indices = v_indices.index_select(0, v_indices_is)
            state_sample.append(state_batch.index_select(0, v_indices))
            action_sample.append(action_batch.index_select(0, v_indices))
            target_sample.append(target_batch.index_select(0, v_indices))
        state_batch = torch.cat(state_sample)
        action_batch = torch.cat(action_sample)
        target_batch = torch.cat(target_sample)

        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        #   0. loop until SENT_FLAG is not set
        #
        #   1. loop for a fixed # of steps:
        #         minibatch, and get the target value for the batch
        #         optimize the net parameters by this batch
        #         every fixed number of steps, copy weights from Q-net to target-net
        #
        #   2. copy the Q-net weights into the shared weights
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping...')
                time.sleep(0.1)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    #self.semaphore.acquire()
                    self.copy_weights()
                    #self.semaphore.release()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            if i == 50:
                pretrain = False
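
For context, a minimal sketch of the producer side of this handshake: the Evaluator trains whenever SENT_FLAG is False and raises the flag after publishing fresh weights. The Manager-backed memory and shared objects and the collect_transition() helper are illustrative assumptions, not part of the original code; DQN is the network class the examples already use.

import multiprocessing

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    memory = manager.list()                    # shared replay buffer of transitions
    shared = manager.dict({'SENT_FLAG': True,  # keep the evaluator asleep at first
                           'weights': None})
    semaphore = multiprocessing.Semaphore(1)

    evaluator = Evaluator(memory, shared, semaphore)
    evaluator.start()

    actor = DQN()  # acting copy of the network
    while True:
        # collect_transition() is hypothetical: it should return one
        # (state, action, reward, next_state, not_terminal) tuple
        memory.append(collect_transition(actor))
        if shared['SENT_FLAG'] and len(memory) >= evaluator.BATCH_SIZE:
            if shared['weights'] is not None:
                actor.load_state_dict(shared['weights'])  # pull the latest weights
            shared['SENT_FLAG'] = False  # let the evaluator run another training round
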
Example #2
class Evaluator(multiprocessing.Process):
    def __init__(self, shared):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 500
        self.TRANSFER = 100
        self.BATCH_SIZE = 32
        self.GAMMA = 1.0
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}

    def minibatch(self, exp_replay, pretrain=False):
        batch = exp_replay.sample(self.BATCH_SIZE)
        unzipped = list(zip(*batch))
        state_batch = np.concatenate(list(unzipped[0]))
        state_batch = Variable(torch.from_numpy(state_batch))
        action_batch = np.concatenate(list(unzipped[1]))
        action_batch = Variable(torch.from_numpy(action_batch))
        reward_batch = np.concatenate(list(unzipped[2]))
        reward_batch = Variable(torch.from_numpy(reward_batch),
                                requires_grad=False)
        #state_batch = Variable(torch.cat(list(unzipped[0])).clone())
        #action_batch = Variable(torch.cat(list(unzipped[1])).clone())
        #reward_batch = Variable(torch.cat(list(unzipped[2])).clone(), requires_grad=False)

        if pretrain:
            # only use reward
            return state_batch, action_batch, reward_batch
        else:
            term_batch = np.concatenate(list(unzipped[5]))
            term_batch = Variable(torch.from_numpy(term_batch), volatile=True)
            next_action_batch = np.concatenate(list(unzipped[4]))
            next_action_batch = Variable(torch.from_numpy(next_action_batch),
                                         volatile=True)
            next_state_batch = np.concatenate(list(unzipped[3]))
            next_state_batch = Variable(torch.from_numpy(next_state_batch),
                                        volatile=True)
            next_state_values = self.targetNet(next_state_batch).gather(
                1, next_action_batch)
            #term_batch = Variable(torch.cat(list(unzipped[5]).clone()), volatile=True)
            #next_action_batch = Variable(torch.cat(list(unzipped[4]).clone()), volatile=True)
            #next_state_values = self.targetNet.evaluate(list(unzipped[3]).clone()).gather(1, next_action_batch)
            next_state_values = term_batch * next_state_values
            print(next_state_values)
            next_state_values.volatile = False
            next_state_values.requires_grad = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
            return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        #   0. loop until SENT_FLAG is not set
        #
        #   1. loop for a fixed # of steps:
        #         minibatch, and get the target value for the batch
        #         optimize the net parameters by this batch
        #         every fixed number of steps, copy weights from Q-net to target-net
        #
        #   2. copy the Q-net weights into the shared weights
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        pretrain = True
        while True:
            #print('evaluator starts...')
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping... size: {}'.format(len(
                    self.shared['memory'])))
                time.sleep(1)
            for step_i in range(self.TRAIN_MAX):
                # minibatch, and get the target value
                print('training... step {}'.format(step_i))
                #memory = deepcopy(self.shared['memory'])
                batch_tuple = self.minibatch(self.shared['memory'], pretrain)
                #print('got batch tuple')
                loss = self.net.optimize(batch_tuple)
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True

            pretrain = False
Example #3
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 10
        self.TRANSFER = 10
        self.BATCH_SIZE = 32
        self.GAMMA = 0.99
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPSILON = 0.
        self.SAMPLE_BETA = 0.

        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(unzipped[0])),
                               volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        #state_batch = Variable(torch.cat(list(unzipped[0])).clone(), volatile=True)
        #action_batch = Variable(torch.cat(list(unzipped[1])).clone(), volatile=True)
        #reward_batch = Variable(torch.cat(list(unzipped[2])).clone(), volatile=True)
        #target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor),
                                  volatile=True)
            #term_batch = Variable(torch.cat(list(unzipped[4])).clone(), volatile=True)
            #next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(), volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])),
                                        volatile=True)
            dist_norm = self.targetNet.getdistance(state_batch,
                                                   next_state_batch)
            #print('average distance: {}' . format(dist_norm))
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = self.targetNet(next_state_batch).max(
                1)[0].unsqueeze(1)
            #prediction_state_values = self.targetNet(state_batch).gather(1, action_batch)
            #not_action_batch = Variable(torch.from_numpy(1-np.array(unzipped[1])).type(LongTensor), volatile=True)
            #prediction_state_nonterm_values = self.targetNet(state_batch).gather(1, not_action_batch)
            #print('term average value: {}' . format(torch.sum((1-term_batch) * prediction_state_values).data[0]/torch.sum(1-term_batch).data[0]))
            #print('nonterm average value: {}' . format(torch.sum((1-term_batch) * prediction_state_nonterm_values).data[0]/torch.sum(1-term_batch).data[0]))
            next_state_values = term_batch * next_state_values

            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
        # calculate the probability for each transition
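        # prioritised sampling: each transition is drawn with probability
        # proportional to |target - Q(s, a)|^SAMPLE_ALPHA, i.e. its TD-error magnitude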
        state_values = self.net(state_batch).gather(1, action_batch)
        probability_batch = torch.pow(torch.abs(target_batch - state_values),
                                      self.SAMPLE_ALPHA).squeeze(1)
        print(probability_batch)
        sample_is = torch.multinomial(probability_batch, self.BATCH_SIZE)
        state_batch = state_batch.index_select(0, sample_is)
        action_batch = action_batch.index_select(0, sample_is)
        target_batch = target_batch.index_select(0, sample_is)
        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        #   0. loop until SENT_FLAG is not set
        #
        #   1. loop for a fixed # of steps:
        #         minibatch, and get the target value for the batch
        #         optimize the net parameters by this batch
        #         every fixed number of steps, copy weights from Q-net to target-net
        #
        #   2. copy the Q-net weights into the shared weights
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping...')
                time.sleep(0.1)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    #self.semaphore.acquire()
                    self.copy_weights()
                    #self.semaphore.release()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True

            pretrain = False
Example #4
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 1
        self.TRANSFER = 1
        self.BATCH_SIZE = 9
        #self.BATCH_SIZE = 5
        self.GAMMA = 0.99
        #self.SAMPLE_ALPHA = 0.5
        #self.SAMPLE_EPSILON = 0.
        #self.SAMPLE_BETA = 0.
        #self.SAMPLE_S = 44.8
        self.SAMPLE_S = 5.0
        self.SAMPLE_Q = 1.0

        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(unzipped[0])),
                               volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor),
                                  volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])),
                                        volatile=True)
            #print('average distance: {}' . format(dist_norm))
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = self.targetNet(next_state_batch).max(
                1)[0].unsqueeze(1)
            #prediction_state_values = self.targetNet(state_batch).gather(1, action_batch)
            #not_action_batch = Variable(torch.from_numpy(1-np.array(unzipped[1])).type(LongTensor), volatile=True)
            #prediction_state_nonterm_values = self.targetNet(state_batch).gather(1, not_action_batch)
            #print('term average value: {}' . format(torch.sum((1-term_batch) * prediction_state_values).data[0]/torch.sum(1-term_batch).data[0]))
            #print('nonterm average value: {}' . format(torch.sum((1-term_batch) * prediction_state_nonterm_values).data[0]/torch.sum(1-term_batch).data[0]))
            next_state_values = term_batch * next_state_values

            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
        # calculate the probability for each transition
        # calculate distance matrix
        state_feature_batch = self.targetNet.getstate(state_batch)
        inner_product = state_feature_batch.matmul(
            state_feature_batch.transpose(1, 0))
        state_feature_batch_l2 = (state_feature_batch**2).sum(
            dim=1, keepdim=True).expand_as(inner_product)
        distance_matrix = state_feature_batch_l2 + state_feature_batch_l2.transpose(
            1, 0) - 2 * inner_product
        #print('distance state')
        #print(distance_matrix.data)
        # calculate Q-value distance matrix
        # Here use target value to calculate
        Q_dist_matrix = target_batch.expand_as(distance_matrix)
        Q_dist_matrix = Q_dist_matrix - Q_dist_matrix.transpose(
            1, 0)  # not absolute value
        Q_dist_matrix = Q_dist_matrix.abs()
        #print('distance q')
        #print(Q_dist_matrix.data)
        # Number[i,j] = Number[i,j] + (D_f[i,j] <= sample_S^2 AND D_Q[i,j] <= sample_Q AND action[i]=action[j])
        # only consider same actions
        Action_Mask = (action_batch.expand_as(distance_matrix)) == (
            action_batch.transpose(1, 0).expand_as(distance_matrix))
        Mask = (distance_matrix.data <=
                (self.SAMPLE_S**2)) & (Q_dist_matrix.data <=
                                       self.SAMPLE_Q) & Action_Mask.data
        Mask = Mask.type(FloatTensor)
        Number = Mask.sum(dim=1, keepdim=True)
        # using the mask to calculate the number used for each transition
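        # p_i is proportional to (1/N_i) * sum_j Mask[i, j] / N_j: every transition
        # spreads a unit of weight uniformly over its N_j neighbours, and each
        # transition then averages what it receives over its own neighbourhood size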
        probability_batch = Mask.matmul(1. / Number) / Number
        probability_batch = probability_batch.squeeze(1)
        #print(probability_batch)
        sample_is = torch.multinomial(probability_batch, self.BATCH_SIZE)
        state_batch = state_batch.index_select(0, sample_is)
        action_batch = action_batch.index_select(0, sample_is)
        target_batch = target_batch.index_select(0, sample_is)
        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        #   0. loop until SENT_FLAG is not set
        #
        #   1. loop for a fixed # of steps:
        #         minibatch, and get the target value for the batch
        #         optimize the net parameters by this batch
        #         every fixed number of steps, copy weights from Q-net to target-net
        #
        #   2. copy the Q-net weights into the shared weights
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping...')
                time.sleep(0.1)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    #self.semaphore.acquire()
                    self.copy_weights()
                    #self.semaphore.release()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            if i == 50:
                pretrain = False
Example #5
class Evaluator(multiprocessing.Process):
    def __init__(self, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 50
        self.TRANSFER = 50
        self.BATCH_SIZE = 32
        self.GAMMA = 0.99
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPSILON = 0.
        self.SAMPLE_BETA = 0.

        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):

        batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*batch))
        #state_batch = torch.from_numpy(np.concatenate(list(unzipped[0])))
        #state_batch = Variable(state_batch)
        #action_batch = torch.from_numpy(np.concatenate(list(unzipped[1])))
        #action_batch = Variable(action_batch)
        #reward_batch = torch.from_numpy(np.concatenate(list(unzipped[2])))
        #reward_batch = Variable(reward_batch, requires_grad=False)
        state_batch = Variable(torch.cat(list(unzipped[0])).clone())
        action_batch = Variable(torch.cat(list(unzipped[1])).clone())
        reward_batch = Variable(torch.cat(list(unzipped[2])).clone(),
                                requires_grad=False)

        if pretrain:
            # only use reward
            return state_batch, action_batch, reward_batch
        else:
            #term_batch = torch.from_numpy(np.concatenate(list(unzipped[5])))
            #term_batch = Variable(term_batch, volatile=True)
            term_batch = Variable(torch.cat(list(unzipped[5])).clone(),
                                  volatile=True)
            #next_action_batch = torch.from_numpy(np.concatenate(list(unzipped[4])))
            #next_action_batch = Variable(next_action_batch, volatile=True)
            next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(),
                                         volatile=True)
            #next_state = torch.from_numpy(np.concatenate(list(unzipped[3])))
            #next_state = Variable(next_state, volatile=True)
            #next_state_values = self.targetNet(next_state).gather(1, next_action_batch)
            next_state_values = self.targetNet.evaluate(list(
                unzipped[3])).gather(1, next_action_batch)
            #non_final_mask = ByteTensor(tuple(map(lambda s: s is not None, list(unzipped[3]))))
            #non_final_next_states = Variable(torch.cat([s for s in list(unzipped[3]) if s is not None]),
            #                                 volatile=True)
            #next_state_values = Variable(torch.zeros(self.BATCH_SIZE).type(Tensor))
            #next_state_values[non_final_mask] = self.targetNet(non_final_next_states).gather(1, next_action_batch)
            #next_state_values.volatile = False
            #print(next_state_values)
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            #next_state_values.requires_grad = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
            return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        #   0. loop until SENT_FLAG is not set
        #
        #   1. loop for a fixed # of steps:
        #         minibatch, and get the target value for the batch
        #         optimize the net parameters by this batch
        #         every fixed number of steps, copy weights from Q-net to target-net
        #
        #   2. copy the Q-net weights into the shared weights
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        pretrain = True
        while True:
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping... size: {}'.format(len(
                    self.shared['memory'])))
                time.sleep(0.1)
            print('training...')
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                self.semaphore.acquire()
                memory = copy.deepcopy(self.shared['memory'])
                self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue

                batch_tuple = self.minibatch(memory, pretrain)
                #print('got batch tuple')
                #print(batch_tuple[0].type)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True

            pretrain = False
Example #6
class MyProcess(mp.Process):
    def __init__(self, inputs):
        mp.Process.__init__(self)
        self.BATCH_SIZE = 32
        self.TRAIN_MAX = 500
        self.TRANSFER = 100
        self.GAMMA = 1.0
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01
        self.demonet = DQN()
        self.targetnet = DQN()
        self.copy_weights()
        self.demonet.setOptimizer(
            optim.RMSprop(self.demonet.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.inputs = inputs
        #self.demonet.setOptimizer(optim.Adam(params=self.demonet.parameters()))
    def copy_weights(self):
        self.targetnet.load_state_dict(self.demonet.state_dict())

    def minibatch(self, exp_replay, pretrain=False):
        batch = exp_replay.sample(self.BATCH_SIZE)
        unzipped = list(zip(*batch))
        #state_batch = np.concatenate(list(unzipped[0]))
        #state_batch = Variable(torch.from_numpy(state_batch))
        #action_batch = np.concatenate(list(unzipped[1]))
        #action_batch = Variable(torch.from_numpy(action_batch))
        #reward_batch = np.concatenate(list(unzipped[2]))
        #reward_batch = Variable(torch.from_numpy(reward_batch), requires_grad=False)
        state_batch = Variable(torch.cat(list(unzipped[0])).clone())
        action_batch = Variable(torch.cat(list(unzipped[1])).clone())
        reward_batch = Variable(torch.cat(list(unzipped[2])).clone(),
                                requires_grad=False)

        if pretrain:
            # only use reward
            return state_batch, action_batch, reward_batch
        else:
            #term_batch = np.concatenate(list(unzipped[5]))
            #term_batch = Variable(torch.from_numpy(term_batch), volatile=True)
            #next_action_batch = np.concatenate(list(unzipped[4]))
            #next_action_batch = Variable(torch.from_numpy(next_action_batch), volatile=True)
            #next_state_batch = np.concatenate(list(unzipped[3]))
            #next_state_batch = Variable(torch.from_numpy(next_state_batch), volatile=True)
            #next_state_values = self.targetNet(next_state_batch).gather(1,next_action_batch)
            term_batch = Variable(torch.cat(list(unzipped[5])).clone(),
                                  volatile=True)
            next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(),
                                         volatile=True)
            next_state_values = self.targetnet.evaluate(
                list(unzipped[3])).gather(1, next_action_batch)
            next_state_values = term_batch * next_state_values
            print(next_state_values)
            next_state_values.volatile = False
            next_state_values.requires_grad = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
            return state_batch, action_batch, target_batch

    def run(self):
        pretrain = True
        while True:
            while self.inputs['SENT_FLAG']:
                print('sleeping... size: {}'.format(len(
                    self.inputs['inputs'])))
                time.sleep(1)
            for step_i in range(self.TRAIN_MAX):
                sample = self.minibatch(self.inputs['inputs'], pretrain)
                self.demonet(sample[0])
                print('hello world')
                loss = self.demonet.optimize(sample)
                #time.sleep(1)
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.inputs['weights'] = self.demonet.state_dict()
            self.inputs['SENT_FLAG'] = True