Example #1
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 10
        self.TRANSFER = 10
        self.BATCH_SIZE = 128
        #self.BATCH_SIZE = 5
        self.GAMMA = 0.99
        #self.SAMPLE_ALPHA = 0.5
        #self.SAMPLE_EPISLON = 0.
        #self.SAMPLE_BETA = 0.
        #self.SAMPLE_S = 44.8
        self.SAMPLE_S = 5.0
        self.SAMPLE_Q = 1.0

        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore
Example #2
    def __init__(self, inputs):
        mp.Process.__init__(self)
        self.BATCH_SIZE = 32
        self.TRAIN_MAX = 500
        self.TRANSFER = 100
        self.GAMMA = 1.0
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01
        self.demonet = DQN()
        self.targetnet = DQN()
        self.copy_weights()
        self.demonet.setOptimizer(
            optim.RMSprop(self.demonet.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.inputs = inputs
Example #3
    def __init__(self, shared):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 500
        self.TRANSFER = 100
        self.BATCH_SIZE = 32
        self.GAMMA = 1.0
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
Example #4
#from Env.Environment import Environment
from Env.gymEnv_V2 import myGym
#from Env.gymEnv import myGym
from DQN.Improver_Q_Learning import Improver
from DQN.Evaluator_Dense_Q_Learning import Evaluator
from DQN.ReplayMemory import ReplayMemory
import os
import multiprocessing
from multiprocessing.managers import SyncManager
os.system(
    "taskset -p 0xff %d" % os.getpid()
)  #https://stackoverflow.com/questions/15639779/why-does-multiprocessing-use-only-a-single-core-after-i-import-numpy
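# Note: on Linux the same affinity reset can be done without shelling out to
# taskset, using only the standard library (sketch, assumes Python 3.3+; this
# is an alternative, not part of the original example):
if hasattr(os, 'sched_setaffinity'):
    os.sched_setaffinity(0, range(os.cpu_count() or 1))  # allow all CPUs for this process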

if __name__ == '__main__':
    # hyperparameters
    MEMORY_SIZE = 5000
    #MEMORY_SIZE = 5
    imp_net = DQN()
    # populate memory
    # let improver populate first
    manager = SyncManager()
    manager.start()
    memory = ReplayMemory(MEMORY_SIZE)
    s = multiprocessing.Semaphore(1)
    #memory = multiprocessing.Queue(MEMORY_SIZE)
    memory = manager.list()
    shared = manager.dict({'SENT_FLAG': True, 'weights': None})
    #shared = manager.dict({'memory':memory, 'SENT_FLAG':True, 'weights':None})
    #improver = Improver(imp_net, shared, myGym(), s)
    improver = Improver(imp_net, MEMORY_SIZE, memory, shared, myGym(), s)
    # improver is executed by the main process
    evaluator = Evaluator(memory, shared, s)
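    # Hypothetical continuation (the original snippet is cut off above): assuming,
    # as the comment says, that the Improver runs in the main process while the
    # Evaluator trains in a child process, the wiring would presumably end with:
    evaluator.start()    # start the Evaluator in its own process
    improver.run()       # run the Improver in the main process
    evaluator.join()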
Example #5
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 10
        self.TRANSFER = 10
        self.BATCH_SIZE = 128
        #self.BATCH_SIZE = 5
        self.GAMMA = 0.99
        #self.SAMPLE_ALPHA = 0.5
        #self.SAMPLE_EPISLON = 0.
        #self.SAMPLE_BETA = 0.
        #self.SAMPLE_S = 44.8
        self.SAMPLE_S = 5.0
        self.SAMPLE_Q = 1.0

        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(unzipped[0])),
                               volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor),
                                  volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])),
                                        volatile=True)
            #print('average distance: {}' . format(dist_norm))
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = self.targetNet(next_state_batch).max(
                1)[0].unsqueeze(1)
            #prediction_state_values = self.targetNet(state_batch).gather(1, action_batch)
            #not_action_batch = Variable(torch.from_numpy(1-np.array(unzipped[1])).type(LongTensor), volatile=True)
            #prediction_state_nonterm_values = self.targetNet(state_batch).gather(1, not_action_batch)
            #print('term average value: {}' . format(torch.sum((1-term_batch) * prediction_state_values).data[0]/torch.sum(1-term_batch).data[0]))
            #print('nonterm average value: {}' . format(torch.sum((1-term_batch) * prediction_state_nonterm_values).data[0]/torch.sum(1-term_batch).data[0]))
            next_state_values = term_batch * next_state_values

            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
        # calculate the probability for each transition
        # calculate distance matrix
        state_feature_batch = self.targetNet.getstate(state_batch)
        inner_product = state_feature_batch.matmul(
            state_feature_batch.transpose(1, 0))
        state_feature_batch_l2 = (state_feature_batch**2).sum(
            dim=1, keepdim=True).expand_as(inner_product)
        distance_matrix = state_feature_batch_l2 + state_feature_batch_l2.transpose(
            1, 0) - 2 * inner_product
        #print('distance state')
        #print(distance_matrix.data)
        # calculate the Q-value distance matrix
        # (computed here from the target values)
        Q_dist_matrix = target_batch.expand_as(distance_matrix)
        Q_dist_matrix = Q_dist_matrix - Q_dist_matrix.transpose(
            1, 0)  # not absolute value
        Q_dist_matrix = Q_dist_matrix.abs()
        #print('distance q')
        #print(Q_dist_matrix.data)
        # Number[i,j] = Number[i,j] + (D_f[i,j] <= sample_S^2 AND D_Q[i,j] <= sample_Q AND action[i]=action[j])
        # only consider same actions
        Action_Mask = (action_batch.expand_as(distance_matrix)) == (
            action_batch.transpose(1, 0).expand_as(distance_matrix))
        Mask = (distance_matrix.data <=
                (self.SAMPLE_S)) & (Q_dist_matrix.data <=
                                    self.SAMPLE_Q) & Action_Mask.data
        Cluster = []
        #print('mask')
        counter = 0
        while True:
            # clustering by VERTEX-COVER-ALL-VERTEX: repeatedly pick the vertex with the largest degree
            #print('counter = {}' . format(counter))
            counter += 1

            Number = Mask.sum(dim=1)
            value, indx = Number.max(dim=0)
            #print('indx= {}' . format(indx))
            if value[0] == 0:
                # already empty
                break
            v = Mask[indx]
            #print(v)
            #print(Mask)
            Cluster.append(v)
            # delete vertices
            Delete = v.expand_as(Mask) | v.transpose(1, 0).expand_as(Mask)
            Delete = Delete ^ 1
            #Delete = v.transpose(1,0).matmul(v) ^ 1
            #print(Delete)
            Mask = Mask & Delete
        k = len(Cluster)
        Cluster = torch.cat(Cluster)
        #print('cluster')
        #print(Cluster)
        Number = Cluster.sum(dim=1).type(LongTensor)
        probability_batch = torch.ones(k) / float(k)
        cluster_is = torch.multinomial(probability_batch,
                                       self.BATCH_SIZE,
                                       replacement=True)
        # convert the cluster indices to number of items in each cluster
        Sample_num = torch.eye(k).index_select(
            0, cluster_is).sum(dim=0).type(LongTensor)
        #N = Cluster[0].size()[0] # number of vertices
        state_sample = []
        action_sample = []
        target_sample = []
        for i in range(k):
            n = Sample_num[i]
            N = Number[i]
            if n == 0:
                continue
            cluster = Cluster[i]
            # get nonzero indices
            v_indices = cluster.nonzero().squeeze(1)
            if n == N:
                # pick up all
                state_sample.append(state_batch.index_select(0, v_indices))
                action_sample.append(action_batch.index_select(0, v_indices))
                target_sample.append(target_batch.index_select(0, v_indices))
                continue
            prob = torch.ones(v_indices.size()) / n
            if n < N:
                # uniformly pick
                v_indices_is = torch.multinomial(prob, n)
                v_indices = v_indices.index_select(0, v_indices_is)
                state_sample.append(state_batch.index_select(0, v_indices))
                action_sample.append(action_batch.index_select(0, v_indices))
                target_sample.append(target_batch.index_select(0, v_indices))
                continue
            # uniformly pick with replacement
            v_indices_is = torch.multinomial(prob, n, replacement=True)
            v_indices = v_indices.index_select(0, v_indices_is)
            state_sample.append(state_batch.index_select(0, v_indices))
            action_sample.append(action_batch.index_select(0, v_indices))
            target_sample.append(target_batch.index_select(0, v_indices))
        state_batch = torch.cat(state_sample)
        action_batch = torch.cat(action_sample)
        target_batch = torch.cat(target_sample)

        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: the Q-net and the target-net
        # main loop:
        #   0. wait until SENT_FLAG is cleared
        #
        #   1. loop for a fixed number of steps:
        #         build a minibatch and compute its target values
        #         optimize the net parameters on this batch
        #         every TRANSFER steps, copy weights from the Q-net to the target-net
        #
        #   2. copy the Q-net weights into the shared 'weights' entry
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping...')
                time.sleep(0.1)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    #self.semaphore.acquire()
                    self.copy_weights()
                    #self.semaphore.release()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            if i == 50:
                pretrain = False
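Note on the handshake: these examples only show the Evaluator side of the SENT_FLAG protocol. A minimal sketch of what the complementary Improver loop presumably does (append transitions to the shared memory list, then pull fresh weights once the Evaluator has published them) follows; the environment interface and the select_action helper are assumptions for illustration, not code from the original project.

# Hypothetical Improver-side loop for the SENT_FLAG handshake (sketch only).
# `memory` is the manager list and `shared` the manager dict created in the
# main script; `net` is a DQN whose weights are replaced via load_state_dict.
def improver_loop(env, net, memory, shared, episodes=1000):
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            action = select_action(net, state)            # assumed epsilon-greedy helper
            next_state, reward, done = env.step(action)   # assumed environment interface
            term = 0.0 if done else 1.0                   # matches term_batch usage above
            memory.append((state, action, reward, next_state, term))
            state = next_state
        if shared['SENT_FLAG']:
            if shared['weights'] is not None:
                net.load_state_dict(shared['weights'])    # pull the latest trained weights
            shared['SENT_FLAG'] = False                   # let the Evaluator train again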
Example #6
class Evaluator(multiprocessing.Process):
    def __init__(self, shared):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 500
        self.TRANSFER = 100
        self.BATCH_SIZE = 32
        self.GAMMA = 1.0
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}

    def minibatch(self, exp_replay, pretrain=False):
        batch = exp_replay.sample(self.BATCH_SIZE)
        unzipped = list(zip(*batch))
        state_batch = np.concatenate(list(unzipped[0]))
        state_batch = Variable(torch.from_numpy(state_batch))
        action_batch = np.concatenate(list(unzipped[1]))
        action_batch = Variable(torch.from_numpy(action_batch))
        reward_batch = np.concatenate(list(unzipped[2]))
        reward_batch = Variable(torch.from_numpy(reward_batch),
                                requires_grad=False)
        #state_batch = Variable(torch.cat(list(unzipped[0])).clone())
        #action_batch = Variable(torch.cat(list(unzipped[1])).clone())
        #reward_batch = Variable(torch.cat(list(unzipped[2])).clone(), requires_grad=False)

        if pretrain:
            # only use reward
            return state_batch, action_batch, reward_batch
        else:
            term_batch = np.concatenate(list(unzipped[5]))
            term_batch = Variable(torch.from_numpy(term_batch), volatile=True)
            next_action_batch = np.concatenate(list(unzipped[4]))
            next_action_batch = Variable(torch.from_numpy(next_action_batch),
                                         volatile=True)
            next_state_batch = np.concatenate(list(unzipped[3]))
            next_state_batch = Variable(torch.from_numpy(next_state_batch),
                                        volatile=True)
            next_state_values = self.targetNet(next_state_batch).gather(
                1, next_action_batch)
            #term_batch = Variable(torch.cat(list(unzipped[5]).clone()), volatile=True)
            #next_action_batch = Variable(torch.cat(list(unzipped[4]).clone()), volatile=True)
            #next_state_values = self.targetNet.evaluate(list(unzipped[3]).clone()).gather(1, next_action_batch)
            next_state_values = term_batch * next_state_values
            print(next_state_values)
            next_state_values.volatile = False
            next_state_values.requires_grad = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
            return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: the Q-net and the target-net
        # main loop:
        #   0. wait until SENT_FLAG is cleared
        #
        #   1. loop for a fixed number of steps:
        #         build a minibatch and compute its target values
        #         optimize the net parameters on this batch
        #         every TRANSFER steps, copy weights from the Q-net to the target-net
        #
        #   2. copy the Q-net weights into the shared 'weights' entry
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        pretrain = True
        while True:
            #print('evaluator starts...')
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping... size: {}'.format(len(
                    self.shared['memory'])))
                time.sleep(1)
            for step_i in range(self.TRAIN_MAX):
                # minibatch, and get the target value
                print('training... step {}'.format(step_i))
                #memory = deepcopy(self.shared['memory'])
                batch_tuple = self.minibatch(self.shared['memory'], pretrain)
                #print('got batch tuple')
                loss = self.net.optimize(batch_tuple)
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True

            pretrain = False
Example #7
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 10
        self.TRANSFER = 10
        self.BATCH_SIZE = 32
        self.GAMMA = 0.99
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPISLON = 0.
        self.SAMPLE_BETA = 0.

        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(unzipped[0])),
                               volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        #state_batch = Variable(torch.cat(list(unzipped[0])).clone(), volatile=True)
        #action_batch = Variable(torch.cat(list(unzipped[1])).clone(), volatile=True)
        #reward_batch = Variable(torch.cat(list(unzipped[2])).clone(), volatile=True)
        #target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor),
                                  volatile=True)
            #term_batch = Variable(torch.cat(list(unzipped[4])).clone(), volatile=True)
            #next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(), volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])),
                                        volatile=True)
            dist_norm = self.targetNet.getdistance(state_batch,
                                                   next_state_batch)
            #print('average distance: {}' . format(dist_norm))
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = self.targetNet(next_state_batch).max(
                1)[0].unsqueeze(1)
            #prediction_state_values = self.targetNet(state_batch).gather(1, action_batch)
            #not_action_batch = Variable(torch.from_numpy(1-np.array(unzipped[1])).type(LongTensor), volatile=True)
            #prediction_state_nonterm_values = self.targetNet(state_batch).gather(1, not_action_batch)
            #print('term average value: {}' . format(torch.sum((1-term_batch) * prediction_state_values).data[0]/torch.sum(1-term_batch).data[0]))
            #print('nonterm average value: {}' . format(torch.sum((1-term_batch) * prediction_state_nonterm_values).data[0]/torch.sum(1-term_batch).data[0]))
            next_state_values = term_batch * next_state_values

            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
        # calculate the probability for each transition
        state_values = self.net(state_batch).gather(1, action_batch)
        probability_batch = torch.pow(torch.abs(target_batch - state_values),
                                      self.SAMPLE_ALPHA).squeeze(1)
        print(probability_batch)
        sample_is = torch.multinomial(probability_batch, self.BATCH_SIZE)
        state_batch = state_batch.index_select(0, sample_is)
        action_batch = action_batch.index_select(0, sample_is)
        target_batch = target_batch.index_select(0, sample_is)
        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: the Q-net and the target-net
        # main loop:
        #   0. wait until SENT_FLAG is cleared
        #
        #   1. loop for a fixed number of steps:
        #         build a minibatch and compute its target values
        #         optimize the net parameters on this batch
        #         every TRANSFER steps, copy weights from the Q-net to the target-net
        #
        #   2. copy the Q-net weights into the shared 'weights' entry
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping...')
                time.sleep(0.1)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    #self.semaphore.acquire()
                    self.copy_weights()
                    #self.semaphore.release()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True

            pretrain = False
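The sampling step above weights each transition by |target - Q(s, a)|**SAMPLE_ALPHA, i.e. proportional prioritisation on the absolute TD error. A tiny self-contained check of those weights (plain PyTorch only, nothing from the project is assumed):

import torch

td_error = torch.tensor([0.1, 0.5, 2.0, 4.0])
alpha = 0.5
weights = td_error.abs().pow(alpha)    # tensor([0.3162, 0.7071, 1.4142, 2.0000])
# torch.multinomial normalises the weights internally, so large-error
# transitions are drawn more often while small-error ones are still sampled.
idx = torch.multinomial(weights, num_samples=3)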
Example #8
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 1
        self.TRANSFER = 1
        self.BATCH_SIZE = 9
        #self.BATCH_SIZE = 5
        self.GAMMA = 0.99
        #self.SAMPLE_ALPHA = 0.5
        #self.SAMPLE_EPISLON = 0.
        #self.SAMPLE_BETA = 0.
        #self.SAMPLE_S = 44.8
        self.SAMPLE_S = 5.0
        self.SAMPLE_Q = 1.0

        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(unzipped[0])),
                               volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor),
                                  volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])),
                                        volatile=True)
            #print('average distance: {}' . format(dist_norm))
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = self.targetNet(next_state_batch).max(
                1)[0].unsqueeze(1)
            #prediction_state_values = self.targetNet(state_batch).gather(1, action_batch)
            #not_action_batch = Variable(torch.from_numpy(1-np.array(unzipped[1])).type(LongTensor), volatile=True)
            #prediction_state_nonterm_values = self.targetNet(state_batch).gather(1, not_action_batch)
            #print('term average value: {}' . format(torch.sum((1-term_batch) * prediction_state_values).data[0]/torch.sum(1-term_batch).data[0]))
            #print('nonterm average value: {}' . format(torch.sum((1-term_batch) * prediction_state_nonterm_values).data[0]/torch.sum(1-term_batch).data[0]))
            next_state_values = term_batch * next_state_values

            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
        # calculate the probability for each transition
        # calculate distance matrix
        state_feature_batch = self.targetNet.getstate(state_batch)
        inner_product = state_feature_batch.matmul(
            state_feature_batch.transpose(1, 0))
        state_feature_batch_l2 = (state_feature_batch**2).sum(
            dim=1, keepdim=True).expand_as(inner_product)
        distance_matrix = state_feature_batch_l2 + state_feature_batch_l2.transpose(
            1, 0) - 2 * inner_product
        #print('distance state')
        #print(distance_matrix.data)
        # calculate the Q-value distance matrix
        # (computed here from the target values)
        Q_dist_matrix = target_batch.expand_as(distance_matrix)
        Q_dist_matrix = Q_dist_matrix - Q_dist_matrix.transpose(
            1, 0)  # not absolute value
        Q_dist_matrix = Q_dist_matrix.abs()
        #print('distance q')
        #print(Q_dist_matrix.data)
        # Number[i,j] = Number[i,j] + (D_f[i,j] <= sample_S^2 AND D_Q[i,j] <= sample_Q AND action[i]=action[j])
        # only consider same actions
        Action_Mask = (action_batch.expand_as(distance_matrix)) == (
            action_batch.transpose(1, 0).expand_as(distance_matrix))
        Mask = (distance_matrix.data <=
                (self.SAMPLE_S**2)) & (Q_dist_matrix.data <=
                                       self.SAMPLE_Q) & Action_Mask.data
        Mask = Mask.type(FloatTensor)
        Number = Mask.sum(dim=1, keepdim=True)
        # using the mask to calculate the number used for each transition
        probability_batch = Mask.matmul(1. / Number) / Number
        probability_batch = probability_batch.squeeze(1)
        #print(probability_batch)
        sample_is = torch.multinomial(probability_batch, self.BATCH_SIZE)
        state_batch = state_batch.index_select(0, sample_is)
        action_batch = action_batch.index_select(0, sample_is)
        target_batch = target_batch.index_select(0, sample_is)
        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: the Q-net and the target-net
        # main loop:
        #   0. wait until SENT_FLAG is cleared
        #
        #   1. loop for a fixed number of steps:
        #         build a minibatch and compute its target values
        #         optimize the net parameters on this batch
        #         every TRANSFER steps, copy weights from the Q-net to the target-net
        #
        #   2. copy the Q-net weights into the shared 'weights' entry
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping...')
                time.sleep(0.1)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    #self.semaphore.acquire()
                    self.copy_weights()
                    #self.semaphore.release()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            if i == 50:
                pretrain = False
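The probability_batch computation above (Mask.matmul(1. / Number) / Number) gives every transition a weight inversely proportional to the size of its similarity cluster, so each cluster receives roughly equal total sampling mass. A small self-contained numeric check (plain PyTorch only, nothing from the project is assumed):

import torch

# Two clusters: transitions {0, 1, 2} and {3}.
mask = torch.tensor([[1., 1., 1., 0.],
                     [1., 1., 1., 0.],
                     [1., 1., 1., 0.],
                     [0., 0., 0., 1.]])
number = mask.sum(dim=1, keepdim=True)          # cluster size per transition: 3, 3, 3, 1
prob = (mask.matmul(1.0 / number) / number).squeeze(1)
print(prob)                                     # tensor([0.3333, 0.3333, 0.3333, 1.0000])
# Each cluster's weights sum to 1.0, so after torch.multinomial's normalisation
# the singleton cluster is sampled as often as the three-element cluster combined.
idx = torch.multinomial(prob, num_samples=9, replacement=True)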
Example #9
class Evaluator(multiprocessing.Process):
    def __init__(self, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 50
        self.TRANSFER = 50
        self.BATCH_SIZE = 32
        self.GAMMA = 0.99
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPISLON = 0.
        self.SAMPLE_BETA = 0.

        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):

        batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*batch))
        #state_batch = torch.from_numpy(np.concatenate(list(unzipped[0])))
        #state_batch = Variable(state_batch)
        #action_batch = torch.from_numpy(np.concatenate(list(unzipped[1])))
        #action_batch = Variable(action_batch)
        #reward_batch = torch.from_numpy(np.concatenate(list(unzipped[2])))
        #reward_batch = Variable(reward_batch, requires_grad=False)
        state_batch = Variable(torch.cat(list(unzipped[0])).clone())
        action_batch = Variable(torch.cat(list(unzipped[1])).clone())
        reward_batch = Variable(torch.cat(list(unzipped[2])).clone(),
                                requires_grad=False)

        if pretrain:
            # only use reward
            return state_batch, action_batch, reward_batch
        else:
            #term_batch = torch.from_numpy(np.concatenate(list(unzipped[5])))
            #term_batch = Variable(term_batch, volatile=True)
            term_batch = Variable(torch.cat(list(unzipped[5])).clone(),
                                  volatile=True)
            #next_action_batch = torch.from_numpy(np.concatenate(list(unzipped[4])))
            #next_action_batch = Variable(next_action_batch, volatile=True)
            next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(),
                                         volatile=True)
            #next_state = torch.from_numpy(np.concatenate(list(unzipped[3])))
            #next_state = Variable(next_state, volatile=True)
            #next_state_values = self.targetNet(next_state).gather(1, next_action_batch)
            next_state_values = self.targetNet.evaluate(list(
                unzipped[3])).gather(1, next_action_batch)
            #non_final_mask = ByteTensor(tuple(map(lambda s: s is not None, list(unzipped[3]))))
            #non_final_next_states = Variable(torch.cat([s for s in list(unzipped[3]) if s is not None]),
            #                                 volatile=True)
            #next_state_values = Variable(torch.zeros(self.BATCH_SIZE).type(Tensor))
            #next_state_values[non_final_mask] = self.targetNet(non_final_next_states).gather(1, next_action_batch)
            #next_state_values.volatile = False
            #print(next_state_values)
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            #next_state_values.requires_grad = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
            return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: the Q-net and the target-net
        # main loop:
        #   0. wait until SENT_FLAG is cleared
        #
        #   1. loop for a fixed number of steps:
        #         build a minibatch and compute its target values
        #         optimize the net parameters on this batch
        #         every TRANSFER steps, copy weights from the Q-net to the target-net
        #
        #   2. copy the Q-net weights into the shared 'weights' entry
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        pretrain = True
        while True:
            while self.shared['SENT_FLAG']:
                # loop until it is set to 0
                print('sleeping... size: {}'.format(len(
                    self.shared['memory'])))
                time.sleep(0.1)
            print('training...')
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                self.semaphore.acquire()
                memory = copy.deepcopy(self.shared['memory'])
                self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue

                batch_tuple = self.minibatch(memory, pretrain)
                #print('got batch tuple')
                #print(batch_tuple[0].type)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True

            pretrain = False
Example #10
from Env.Environment import Environment
from DQN.ReplayMemory import ReplayMemory
import torch.nn as nn
import torch.optim as optim
from Env.gymEnv import myGym
import time
import numpy as np
import cv2
import matplotlib
import matplotlib.pyplot as plt
from multiprocessing import Manager, Event
from multiprocessing.managers import SyncManager
# hyperparameters
MEMORY_SIZE = 100

imp_net = DQN()
#eval_net = DQN()
#eval_target_net = DQN()
#eval_net.setOptimizer(optim.RMSprop(eval_net.parameters(), lr=LEARNING_RATE,
#                                    momentum=MOMENTUM, alpha=SQUARED_MOMENTUM,
#                                    eps=MIN_SQUARED_GRAD))
imp_net.share_memory()
#eval_net.share_memory()
#eval_target_net.share_memory()
#env = Environment()
#env = myGym()

# populate memory
# let improver populate first
SyncManager.register('ReplayMemory',
                     ReplayMemory,
Example #11
from multiprocessing.managers import SyncManager
import torch.nn as nn
import time
from DQN.DQNcartpole import DQN
from Env.gymEnv import myGym
from DQN.CartPoleDQN import CartPoleDQN
from DQN.ReplayMemory import ReplayMemory

if __name__ == '__main__':
    demonet = DQN()
    #manager = SyncManager()
    #manager.start()
    memory = ReplayMemory(10000)
    #for i in range(memory.capacity):
    #    memory.push(torch.FloatTensor(1, 3, 40, 80))
    shared = dict({'memory': memory, 'SENT_FLAG': True, 'weights': None})
    p = CartPoleDQN(DQN(), shared, myGym())
    p.run()
Example #12
class MyProcess(mp.Process):
    def __init__(self, inputs):
        mp.Process.__init__(self)
        self.BATCH_SIZE = 32
        self.TRAIN_MAX = 500
        self.TRANSFER = 100
        self.GAMMA = 1.0
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01
        self.demonet = DQN()
        self.targetnet = DQN()
        self.copy_weights()
        self.demonet.setOptimizer(
            optim.RMSprop(self.demonet.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.inputs = inputs
        #self.demonet.setOptimizer(optim.Adam(params=self.demonet.parameters()))
    def copy_weights(self):
        self.targetnet.load_state_dict(self.demonet.state_dict())

    def minibatch(self, exp_replay, pretrain=False):
        batch = exp_replay.sample(self.BATCH_SIZE)
        unzipped = list(zip(*batch))
        #state_batch = np.concatenate(list(unzipped[0]))
        #state_batch = Variable(torch.from_numpy(state_batch))
        #action_batch = np.concatenate(list(unzipped[1]))
        #action_batch = Variable(torch.from_numpy(action_batch))
        #reward_batch = np.concatenate(list(unzipped[2]))
        #reward_batch = Variable(torch.from_numpy(reward_batch), requires_grad=False)
        state_batch = Variable(torch.cat(list(unzipped[0])).clone())
        action_batch = Variable(torch.cat(list(unzipped[1])).clone())
        reward_batch = Variable(torch.cat(list(unzipped[2])).clone(),
                                requires_grad=False)

        if pretrain:
            # only use reward
            return state_batch, action_batch, reward_batch
        else:
            #term_batch = np.concatenate(list(unzipped[5]))
            #term_batch = Variable(torch.from_numpy(term_batch), volatile=True)
            #next_action_batch = np.concatenate(list(unzipped[4]))
            #next_action_batch = Variable(torch.from_numpy(next_action_batch), volatile=True)
            #next_state_batch = np.concatenate(list(unzipped[3]))
            #next_state_batch = Variable(torch.from_numpy(next_state_batch), volatile=True)
            #next_state_values = self.targetNet(next_state_batch).gather(1,next_action_batch)
            term_batch = Variable(torch.cat(list(unzipped[5])).clone(),
                                  volatile=True)
            next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(),
                                         volatile=True)
            next_state_values = self.targetnet.evaluate(
                list(unzipped[3])).gather(1, next_action_batch)
            next_state_values = term_batch * next_state_values
            print(next_state_values)
            next_state_values.volatile = False
            next_state_values.requires_grad = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
            return state_batch, action_batch, target_batch

    def run(self):
        pretrain = True
        while True:
            while self.inputs['SENT_FLAG']:
                print('sleeping... size: {}'.format(len(
                    self.inputs['inputs'])))
                time.sleep(1)
            for step_i in range(self.TRAIN_MAX):
                sample = self.minibatch(self.inputs['inputs'], pretrain)
                self.demonet(sample[0])
                print('hello world')
                loss = self.demonet.optimize(sample)
                #time.sleep(1)
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.inputs['weights'] = self.demonet.state_dict()
            self.inputs['SENT_FLAG'] = True