Example #1
    def update(self, rbs):
        ## backprop after episode

        ## bootstrap with the network's value estimate for the last state, or 0 at episode end
        obs = rbs[-1][-1]
        if obs.last():
            R = 0
        else:
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(PP.preprocess_screen(screen), axis=0)
            feed = {self.screen: screen}
            R = self.sess.run(self.value, feed_dict=feed)[0]

        ## prepare input & actions & Q value target
        screens = []

        value_target = np.zeros([len(rbs)], dtype=np.float32)
        value_target[-1] = R

        valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
        spatial_action_selected = np.zeros([len(rbs), self.ssize**2],
                                           dtype=np.float32)

        rbs.reverse()
        for i, [obs, action, _] in enumerate(rbs):
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(PP.preprocess_screen(screen), axis=0)
            screens.append(screen)

            reward = obs.reward
            act_id = action.function
            act_args = action.arguments

            ## discounted return; at i == 0, value_target[i - 1] wraps to value_target[-1] == R
            value_target[i] = reward + self.discount * value_target[i - 1]

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen',):  ## spatial arg (note the one-element tuple)
                    idx = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, idx] = 1

        screens = np.concatenate(screens, axis=0)

        ## backprop
        feed = {
            self.screen: screens,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.learning_rate: self.lr
        }
        _, summary = self.sess.run([self.train_op, self.summary_op],
                                   feed_dict=feed)
        self.summary_writer.add_summary(summary, self.cur_episode)
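
The reversed-buffer trick above is easy to miss: because rbs is reversed, value_target[i - 1] at i == 0 wraps around to value_target[-1], which already holds the bootstrap value R, so the loop accumulates the discounted return backwards in a single pass. A minimal standalone sketch of the same computation (the helper name and inputs are illustrative, not part of the agent):

import numpy as np

def discounted_returns(rewards, R, discount):
    ## rewards in chronological order; R bootstraps the value of the
    ## state following the last reward (0 at episode end)
    targets = np.zeros(len(rewards), dtype=np.float32)
    running = R
    for i, r in enumerate(reversed(rewards)):
        running = r + discount * running
        targets[i] = running
    return targets  ## reversed order, matching the reversed replay buffer

## e.g. discounted_returns([1.0, 0.0, 2.0], R=0.5, discount=0.9)
## -> [2.45, 2.205, 2.9845] (last step first)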
Example #2
    def step(self, obs):
        ## feed to network
        minimap = np.array(obs.observation['feature_minimap'],
                           dtype=np.float32)
        minimap = np.expand_dims(PP.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(PP.preprocess_screen(screen), axis=0)
        structure = np.expand_dims(PP.preprocess_structure(obs), axis=0)

        feed = {
            self.minimap: minimap,
            self.screen: screen,
            self.structure: structure
        }
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action], feed_dict=feed)

        ## choose spatial and non-spatial action
        non_spatial_action = non_spatial_action.ravel()
        valid_actions = obs.observation['available_actions']
        ## restrict to no_op (0), select_army (7) and Attack_screen (12)
        valid_actions = [i for i in [0, 7, 12] if i in valid_actions]
        act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]

        spatial_action = spatial_action.ravel()
        target = np.argmax(spatial_action)
        target = [int(target // self.ssize), int(target % self.ssize)]

        ## epsilon greedy exploration
        if self.training and np.random.rand() < self.epsilon[0]:
            act_id = np.random.choice(valid_actions)
        if self.training and np.random.rand() < self.epsilon[1]:
            jitter = int(self.random_range)
            dy = np.random.randint(-jitter, jitter)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-jitter, jitter)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        ## return function
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):  ## spatial arg
                act_args.append([target[1], target[0]])
            else:
                act_args.append([0])  ## non-spatial arg

        return actions.FunctionCall(act_id, act_args)
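
For context, a step() method like this is typically driven by a pysc2 environment loop. A hedged sketch of such a driver, where the map name, resolution, and the A2CAgent class are assumptions rather than part of the original code:

from pysc2.env import sc2_env
from pysc2.lib import features

## hypothetical driver; A2CAgent and the 32x32 resolution are assumed
with sc2_env.SC2Env(
        map_name='MoveToBeacon',
        players=[sc2_env.Agent(sc2_env.Race.terran)],
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=32, minimap=32)),
        step_mul=8) as env:
    agent = A2CAgent()
    timesteps = env.reset()
    while not timesteps[0].last():
        action = agent.step(timesteps[0])   ## FunctionCall chosen by the policy
        timesteps = env.step([action])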
Example #3
    def step(self, obs):
        ## feed to network
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(PP.preprocess_screen(screen), axis=0)
        feed = {self.screen: screen}
        spatial_action = self.sess.run([self.spatial_action],
                                       feed_dict=feed)[0]

        ## choose spatial action
        spatial_action = spatial_action.ravel()
        target = np.argmax(spatial_action)
        target = [int(target // self.ssize), int(target % self.ssize)]
        # print(target, end=' ')
        # print(obs.observation['feature_screen'][5, target[1], target[0]] == 3, end=' ')

        ## epsilon greedy exploration
        if self.training and np.random.rand() < self.epsilon[1]:
            jitter = int(self.random_range)
            dy = np.random.randint(-jitter, jitter)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-jitter, jitter)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        if 490 in obs.observation.available_actions:
            ## 490: Train_SCV_quick
            return actions.FunctionCall(490, [[0]])
        if (obs.observation['player'][3] < obs.observation['player'][4]
                and obs.observation['player'][1] > 50):
            ## spare supply and more than 50 minerals:
            ## select_point (2) at a fixed screen location (presumably the Command Center)
            return actions.FunctionCall(2, [[0], [25, 25]])
        if (not self.walked and len(obs.observation['multi_select']) > 0
                and 264 in obs.observation.available_actions):
            ## workers selected: Harvest_Gather_screen (264) at the predicted target
            self.walked = True
            return actions.FunctionCall(264, [[0], target])
        if (0 in obs.observation.available_actions
                and obs.observation['player'][-4] == 0):
            ## no idle workers: no_op (0)
            return actions.FunctionCall(0, [])
        if (6 in obs.observation.available_actions
                and obs.observation['player'][-4] > 0):
            ## idle workers present: select_idle_worker (6)
            self.walked = False
            return actions.FunctionCall(6, [[1]])

        ## fallback: no_op
        return actions.FunctionCall(0, [])
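
The scripted branches above read obs.observation['player'] by raw index (1 = minerals, 3 = food_used, 4 = food_cap, -4 = idle_worker_count). A short sketch of the same reads with named indices, assuming pysc2's features.Player enum (the helper itself is illustrative):

from pysc2.lib import features

def player_stats(obs):
    ## named access to the fields the scripted branches rely on
    player = obs.observation['player']
    return {
        'minerals': player[features.Player.minerals],               ## index 1
        'food_used': player[features.Player.food_used],             ## index 3
        'food_cap': player[features.Player.food_cap],               ## index 4
        'idle_workers': player[features.Player.idle_worker_count],  ## index 7 (== -4)
    }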
Example #4
    def update(self, rbs):
        ## backprop after episode

        ## bootstrap with the network's value estimate for the last state, or 0 at episode end
        obs = rbs[-1][-1]
        if obs.last():
            R = 0
        else:
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)
            minimap = np.expand_dims(PP.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(PP.preprocess_screen(screen), axis=0)
            structure = np.expand_dims(PP.preprocess_structure(obs), axis=0)
            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.structure: structure
            }
            R = self.sess.run(self.value, feed_dict=feed)[0]

        ## prepare input & actions & Q value target
        minimaps = []
        screens = []
        structures = []

        value_target = np.zeros([len(rbs)], dtype=np.float32)
        value_target[-1] = R

        valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
        spatial_action_selected = np.zeros([len(rbs), self.ssize**2],
                                           dtype=np.float32)
        valid_non_spatial_action = np.zeros([len(rbs), self.action_size],
                                            dtype=np.float32)
        non_spatial_action_selected = np.zeros([len(rbs), self.action_size],
                                               dtype=np.float32)

        rbs.reverse()
        for i, [obs, action, _] in enumerate(rbs):
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)
            minimap = np.expand_dims(PP.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(PP.preprocess_screen(screen), axis=0)
            structure = np.expand_dims(PP.preprocess_structure(obs), axis=0)

            minimaps.append(minimap)
            screens.append(screen)
            structures.append(structure)

            ## scale the environment reward by (cumulative score + 1) * 10
            reward = obs.reward * (obs.observation["score_cumulative"][0] + 1) * 10
            act_id = action.function
            act_args = action.arguments

            value_target[i] = reward + self.discount * value_target[i - 1]

            valid_actions = obs.observation["available_actions"]
            valid_non_spatial_action[i, valid_actions] = 1
            non_spatial_action_selected[i, act_id] = 1

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    idx = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, idx] = 1

        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        structures = np.concatenate(structures, axis=0)

        ## backprop
        feed = {
            self.minimap: minimaps,
            self.screen: screens,
            self.structure: structures,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: self.lr
        }
        _, summary = self.sess.run([self.train_op, self.summary_op],
                                   feed_dict=feed)
        self.summary_writer.add_summary(summary, self.cur_episode)
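
update() expects rbs to be a list of (obs, action, next_obs) records: the bootstrap state is read from rbs[-1][-1], and each record is unpacked as [obs, action, _] during the backward pass. A hedged sketch of an episode-collection loop that could build such a buffer (the agent and env objects are assumed, not shown in the original):

## hypothetical per-episode collection loop
rbs = []
timesteps = env.reset()
obs = timesteps[0]
while True:
    action = agent.step(obs)
    timesteps = env.step([action])
    next_obs = timesteps[0]
    rbs.append([obs, action, next_obs])   ## matches the [obs, action, _] unpacking
    if next_obs.last():
        break
    obs = next_obs
agent.update(rbs)                         ## one backprop pass per episode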