Example #1
    def __init__(self, g, k, s, c, h_g, h_l, std, hidden_size, num_classes):
        """
        Initialize the recurrent attention model and its
        different components.

        Args
        ----
        - g: size of the square patches in the glimpses extracted
          by the retina.
        - k: number of patches to extract per glimpse.
        - s: scaling factor that controls the size of successive patches.
        - c: number of channels in each image.
        - h_g: hidden layer size of the fc layer for `phi`.
        - h_l: hidden layer size of the fc layer for `l`.
        - std: standard deviation of the Gaussian policy.
        - hidden_size: hidden size of the rnn.
        - num_classes: number of classes in the dataset.
        """
        super(RecurrentAttention, self).__init__()
        self.std = std

        self.sensor = glimpse_network(h_g, h_l, g, k, s, c)
        self.rnn = core_network(hidden_size, hidden_size)
        self.locator = location_network(hidden_size, 2, std)
        self.classifier = action_network(hidden_size, num_classes)
        self.baseliner = baseline_network(hidden_size, 1)
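For orientation, a minimal instantiation sketch of the constructor above. The hyperparameter values are illustrative assumptions (loosely in the range used for MNIST-scale RAM experiments), not values taken from the snippet:

    # hypothetical hyperparameters -- adjust for your dataset
    model = RecurrentAttention(
        g=8,              # 8x8 glimpse patches
        k=1,              # one patch per glimpse
        s=2,              # each successive patch doubles in size
        c=1,              # grayscale input
        h_g=128,          # fc size for the glimpse ('what') pathway
        h_l=128,          # fc size for the location ('where') pathway
        std=0.17,         # std of the Gaussian location policy
        hidden_size=256,  # rnn state size
        num_classes=10,
    )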
Example #2
    def __init__(self, g, k, s, c, h_g, h_l, std, hidden_size, num_classes):
        """
        Initialize the recurrent attention model and its
        different components.

        Args
        ----
        - g: size of the square patches in the glimpses extracted
          by the retina.
        - k: number of patches to extract per glimpse.
        - s: scaling factor that controls the size of successive patches.
        - c: number of channels in each image.
        - h_g: hidden layer size of the fc layer for the 'what' representation.
        - h_l: hidden layer size of the fc layer for the 'where' representation.
        - std: standard deviation of the Gaussian policy.
        - hidden_size: hidden size of the LSTM.
        - num_classes: number of classes in the dataset.
        """
        super(RecurrentAttention, self).__init__()
        self.std = std
        self.ret = retina(g, k, s)

        self.sensor = glimpse_3d(h_g, h_l, g, k, s, c)
        self.rnn = core_network(hidden_size, hidden_size)
        self.locator = location_network(hidden_size, 3, std)
        self.classifier = action_network(hidden_size, num_classes)
        self.baseliner = baseline_network(hidden_size, 1)
        self.context = context_network_clin(hidden_size)
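Unlike Example #1, this variant's locator emits three coordinates (location_network(hidden_size, 3, std)), i.e. a 3-D fixation for volumetric glimpses. A minimal sketch of sampling from such a Gaussian location policy; the tanh squashing and the REINFORCE-style detach are assumptions carried over from standard RAM locators:

    import torch

    def sample_location(mu, std):
        # mu: (B, 3) policy mean from the location network
        noise = torch.randn_like(mu) * std  # reparameterized Gaussian noise
        l_t = torch.tanh(mu + noise)        # squash fixations into [-1, 1]
        return l_t.detach()                 # the policy is trained via REINFORCE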
Example #3
def main():

    # load images
    imgs = []
    paths = [data_dir + './lenna.jpg', data_dir + './cat.jpg']
    for path in paths:
        img = img2array(path, desired_size=[512, 512], expand=True)
        imgs.append(torch.from_numpy(img))
    imgs = torch.cat(imgs)

    B, H, W, C = imgs.shape

    loc = torch.Tensor([[-1., 1.], [-1., 1.]])
    imgs, loc = Variable(imgs), Variable(loc)
    sensor = glimpse_network(h_g=128, h_l=128, g=64, k=3, s=2, c=3)
    g_t = sensor(imgs, loc)

    rnn = core_network(input_size=256, hidden_size=256)
    h_t = Variable(torch.zeros(g_t.shape[0], 256))
    h_t = rnn(g_t, h_t)

    classifier = action_network(256, 10)
    a_t = classifier(h_t)

    loc_net = location_network(256, 2, 0.11)
    mu, l_t = loc_net(h_t)

    base = baseline_network(256, 1)
    b_t = base(h_t)

    print("g_t: {}".format(g_t.shape))
    print("h_t: {}".format(h_t.shape))
    print("l_t: {}".format(l_t.shape))
    print("a_t: {}".format(a_t.shape))
    print("b_t: {}".format(b_t.shape))
Example #4
    def __init__(self,
                 g,
                 k,
                 s,
                 c,
                 h_g,
                 h_l,
                 std,
                 hidden_size,
                 num_classes):
        """
        Initialize the recurrent attention model and its
        different components.

        Args
        ----
        - g: size of the square patches in the glimpses extracted
          by the retina.
        - k: number of patches to extract per glimpse.
        - s: scaling factor that controls the size of successive patches.
        - c: number of channels in each image.
        - h_g: hidden layer size of the fc layer for `phi`.
        - h_l: hidden layer size of the fc layer for `l`.
        - std: standard deviation of the Gaussian policy.
        - hidden_size: hidden size of the rnn.
        - num_classes: number of classes in the dataset.
        """
        super(RecurrentAttention, self).__init__()
        self.std = std

        # extract features from x at location l_t_prev and fuse the
        # patch and location information
        self.sensor = glimpse_network(h_g, h_l, g, k, s, c)
        # combine the current glimpse feature g_t with the hidden
        # state h_t from the previous step
        self.rnn = core_network(hidden_size, hidden_size)
        # use the internal state `h_t` of the core network to produce
        # the location coordinates `l_t` for the next time step;
        # takes only the new h_t as input, not the old l_t_prev
        self.locator = location_network(hidden_size, 2, std)
        self.classifier = action_network(hidden_size, num_classes)
        self.baseliner = baseline_network(hidden_size, 1)
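The comments above spell out the per-step dataflow; a minimal sketch of one glimpse step wiring these five components together (the method name step and the exact call signatures are assumptions based on the test script in Example #3):

    def step(self, x, l_t_prev, h_t_prev):
        g_t = self.sensor(x, l_t_prev)        # extract and fuse patch + location
        h_t = self.rnn(g_t, h_t_prev)         # update the recurrent state
        mu, l_t = self.locator(h_t)           # propose the next fixation
        b_t = self.baseliner(h_t).squeeze(1)  # baseline for variance reduction
        log_probs = self.classifier(h_t)      # class log-probabilities
        return h_t, l_t, b_t, log_probs, mu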
Example #5
    def __init__(self,
                 g,
                 k,
                 s,
                 c,
                 h_g,
                 h_l,
                 std,
                 hidden_size,
                 num_classes, 
                 kernel_size, 
                 num_stacks, 
                 stack_attn_mode):
        """
        Initialize the recurrent attention model and its
        different components.

        Args
        ----
        - g: size of the square patches in the glimpses extracted
          by the retina.
        - k: number of patches to extract per glimpse.
        - s: scaling factor that controls the size of successive patches.
        - c: number of channels in each image.
        - h_g: hidden layer size of the fc layer for `phi`.
        - h_l: hidden layer size of the fc layer for `l`.
        - std: standard deviation of the Gaussian policy.
        - hidden_size: hidden size of the rnn.
        - num_classes: number of classes in the dataset.
        - kernel_size: list of int, convolutional kernel sizes in the stacked RAM.
        - num_stacks: int, number of layers in the stacked RAM.
        - stack_attn_mode: str, one of 'separate', 'concat', or 'combine'.
        """
        super(RecurrentAttention, self).__init__()
        self.std = std
        self.num_stacks = num_stacks
        self.stack_attn_mode = stack_attn_mode

        self.sensor = nn.ModuleList([
                glimpse_network(h_g, h_l, g, k, s, c, kernel_size)
                for _ in range(num_stacks)
                ])
        self.rnn = nn.ModuleList([
                core_network(h_g + h_l, hidden_size)
                for _ in range(num_stacks)
                ])
        if stack_attn_mode == 'separate':
            self.locator = nn.ModuleList([
                location_network(hidden_size, 2, std)
                for _ in range(num_stacks)
                ])
        elif stack_attn_mode == 'concat':
            self.locator = location_network(hidden_size * num_stacks, 2, std)
        elif stack_attn_mode == 'combine':
            self.locator = location_network(hidden_size * num_stacks, 2 * num_stacks, std)
        else:
            raise ValueError('Unknown stack_attn_mode [%s]' % stack_attn_mode)

        self.baseliner = nn.ModuleList([
            baseline_network(hidden_size, 1) for _ in range(num_stacks)
            ])
        self.classifier = action_network(hidden_size * num_stacks, num_classes)
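A sketch of how the three stack_attn_mode values could be consumed at forward time. The locate method below is an assumption inferred from the constructor wiring; the snippet itself only builds the modules:

    import torch

    def locate(self, h_list):
        # h_list: per-stack hidden states, each of shape (B, hidden_size)
        if self.stack_attn_mode == 'separate':
            # an independent 2-D location per stack
            return [loc(h) for loc, h in zip(self.locator, h_list)]
        h_cat = torch.cat(h_list, dim=1)  # (B, hidden_size * num_stacks)
        if self.stack_attn_mode == 'concat':
            # a single shared 2-D location computed from all stacks
            mu, l_t = self.locator(h_cat)
            return [(mu, l_t)] * self.num_stacks
        # 'combine': jointly predict 2 coordinates for every stack
        mu, l_t = self.locator(h_cat)  # each of shape (B, 2 * num_stacks)
        return list(zip(mu.chunk(self.num_stacks, dim=1),
                        l_t.chunk(self.num_stacks, dim=1)))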
Example #6
    def __init__(self, g, c, image_size, std, hidden_size, num_classes,
                 config):
        """
        Initialize the recurrent attention model and its
        different components.

        Args
        ----
        - g: size of the square patches in the glimpses extracted
          by the retina.
        - c: number of channels in each image.
        - image_size: a tuple (H, W).
        - std: standard deviation of the Gaussian policy.
        - hidden_size: hidden size of the rnn.
        - num_classes: number of classes in the dataset.
        """
        super(RecurrentAttention, self).__init__()

        # when the location l is drawn from a Gaussian distribution
        self.std = std

        # when the location l is drawn from a symmetric stable distribution
        self.alpha = config.alpha
        self.gamma = config.gamma

        self.config = config

        self.context = context_network(c, config.kernel_size, hidden_size)
        self.sensor = glimpse_network(hidden_size, g, c, config)
        self.rnn = core_network(hidden_size, hidden_size, config)
        self.top_down_locator = location_network(hidden_size, 2, config)
        self.bot_up_locator = Levy_bottom_up_generator(config.batch_size,
                                                       image_size, config)
        self.combine_location = combine_location_network(hidden_size, config)
        self.classifier = action_network(hidden_size, num_classes)
        self.baseliner = baseline_network(hidden_size, 1)

        # setup for the initialization subroutine
        dtype = (torch.cuda.FloatTensor
                 if self.config.use_gpu else torch.FloatTensor)

        # derivative filters for the saliency map
        self.derivative_y = torch.tensor([-1, 0, 1]).reshape(1, 1, 3, 1).type(dtype)
        self.derivative_x = torch.tensor([-1, 0, 1]).reshape(1, 1, 1, 3).type(dtype)
        # a Gaussian window for weighting saliency around a fixation center
        self.gaussian_kernel_sigma = math.floor(
            image_size[0] / 12)  # the paper uses /6, but PyTorch rejects such a large kernel
        gaussian_kernel_size = self.gaussian_kernel_sigma * 2 + 1
        tmp_x, tmp_y = torch.meshgrid(
            torch.arange(-self.gaussian_kernel_sigma,
                         self.gaussian_kernel_sigma + 1).type(dtype),
            torch.arange(-self.gaussian_kernel_sigma,
                         self.gaussian_kernel_sigma + 1).type(dtype))
        self.gaussian_kernel = torch.exp(
            -(tmp_x ** 2 + tmp_y ** 2) / self.gaussian_kernel_sigma ** 2
        ).reshape(1, 1, gaussian_kernel_size, gaussian_kernel_size)
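A minimal sketch of how the buffers built above might be applied; the snippet only constructs them, so this usage (and the method name saliency_gradient) is an assumption:

    import torch
    import torch.nn.functional as F

    def saliency_gradient(self, smap):
        # smap: (B, 1, H, W) saliency map
        gy = F.conv2d(smap, self.derivative_y, padding=(1, 0))  # d/dy, same size
        gx = F.conv2d(smap, self.derivative_x, padding=(0, 1))  # d/dx, same size
        grad_mag = torch.sqrt(gx ** 2 + gy ** 2)
        # weight saliency around the current fixation with the Gaussian window
        weighted = F.conv2d(smap, self.gaussian_kernel,
                            padding=self.gaussian_kernel_sigma)
        return grad_mag, weighted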