Example #1
    def __init__(self, g, k, s, c, h_g, h_l, std, hidden_size, num_classes):
        """
        Initialize the recurrent attention model and its
        different components.

        Args
        ----
        - g: size of the square patches in the glimpses extracted
          by the retina.
        - k: number of patches to extract per glimpse.
        - s: scaling factor that controls the size of successive patches.
        - c: number of channels in each image.
        - h_g: hidden layer size of the fc layer for `phi`.
        - h_l: hidden layer size of the fc layer for `l`.
        - std: standard deviation of the Gaussian policy.
        - hidden_size: hidden size of the rnn.
        - num_classes: number of classes in the dataset.
        """
        super(RecurrentAttention, self).__init__()
        self.std = std

        self.sensor = glimpse_network(h_g, h_l, g, k, s, c)
        self.rnn = core_network(hidden_size, hidden_size)
        self.locator = location_network(hidden_size, 2, std)
        self.classifier = action_network(hidden_size, num_classes)
        self.baseliner = baseline_network(hidden_size, 1)
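A minimal sketch (not part of the original listing) of how these components could be wired together for one glimpse step; the `step` name and return order are illustrative, assuming the constructor signatures above:

    def step(self, x, l_t_prev, h_t_prev):
        g_t = self.sensor(x, l_t_prev)       # glimpse feature vector
        h_t = self.rnn(g_t, h_t_prev)        # updated recurrent state
        mu, l_t = self.locator(h_t)          # next location from the Gaussian policy
        b_t = self.baseliner(h_t).squeeze()  # baseline for the REINFORCE update
        log_probas = self.classifier(h_t)    # class log-probabilities
        return h_t, l_t, b_t, log_probas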
Example #2
def test_glimpse():
    config, unparsed = get_config()
    train_loader, _ = get_train_valid_loader(config.data_dir,
                                             config.batch_size,
                                             config.random_seed,
                                             config.valid_size, config.shuffle,
                                             config.show_sample)

    img, label = next(iter(train_loader))

    # a standalone retina (constructed here but not exercised below)
    r = retina(g=8, k=1, s=1)
    g = glimpse_network(block=BasicBlock,
                        h_g=128,
                        h_l=32,
                        h_s=128,
                        g=8,
                        k=1,
                        s=1,
                        c=3)

    # sample an initial location in [-1, 1] and a size vector in [0, 1]
    l_t_prev = torch.empty(config.batch_size, 2).uniform_(-1, 1)
    size_t_prev = torch.empty(config.batch_size, 5).uniform_(0, 1)

    g_t = g(img, l_t_prev, size_t_prev)

    return g_t
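The location tensor `l_t_prev` above is sampled in [-1, 1], the usual RAM convention for fixation coordinates. A small hypothetical helper (not part of this listing) that maps such coordinates to pixel indices in a T x T image:

import torch

def denormalize(T, coords):
    # map [-1, 1] coordinates to [0, T] pixel indices; e.g. for T=28,
    # denormalize(28, torch.tensor([[0.0, 0.0]])) -> tensor([[14, 14]])
    return (0.5 * (coords + 1.0) * T).long()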
Example #3
    def __init__(self):
        """
        Initialize the recurrent spatial transformer model and its
        different components.

        Unlike the RecurrentAttention examples, this constructor takes
        no arguments: the glimpse size, zoom values, hidden sizes, and
        number of classes are hard-coded below.
        """
        super(RecurrentSpatialTransformer, self).__init__()

        # Crop the image with a differentiable STN at the given location and zoom
        self.stn = stn_zoom(out_height=26, out_width=26)

        # The glimpse network converts the cropped image into feature space
        # with a CNN and an FC layer, then concatenates it with the previous location
        self.sensor = glimpse_network(glimpse_size=26, h_g=1024, h_l=1024, c=1)
        self.rnn1 = nn.GRUCell(2048, 512)
        self.fc_rnn = nn.Linear(512, 2048)
        self.rnn2 = nn.GRUCell(2048, 512)
        self.classifier = classification_network(input_size=512, hidden_size=1024, num_classes=10)
        self.locator = location_network(input_size=512, hidden_size=1024)
        self.context = context_network(100, 512)
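A hedged sketch of one time step of this two-layer recurrence, following the sizes in the constructor (2048-d glimpse features, 512-d GRU states); the split into a classification pathway (`rnn1`) and a location pathway (`rnn2`) mirrors the comments above, but the actual forward pass may differ:

    def step(self, x, l_t_prev, h1_prev, h2_prev):
        patch = self.stn(x, l_t_prev)             # differentiable 26x26 crop
        g_t = self.sensor(patch, l_t_prev)        # 2048-d glimpse feature
        h1 = self.rnn1(g_t, h1_prev)              # classification pathway
        h2 = self.rnn2(self.fc_rnn(h1), h2_prev)  # location pathway
        log_probas = self.classifier(h1)
        mu, l_t = self.locator(h2)
        return h1, h2, l_t, log_probas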
Example #4
def main():

    # load images
    imgs = []
    paths = [data_dir + '/lenna.jpg', data_dir + '/cat.jpg']
    for i in range(len(paths)):
        img = img2array(paths[i], desired_size=[512, 512], expand=True)
        imgs.append(torch.from_numpy(img))
    imgs = torch.cat(imgs)

    B, H, W, C = imgs.shape

    loc = torch.Tensor([[-1., 1.], [-1., 1.]])
    # Variable is a no-op wrapper in PyTorch >= 0.4; kept from the original code
    imgs, loc = Variable(imgs), Variable(loc)
    sensor = glimpse_network(h_g=128, h_l=128, g=64, k=3, s=2, c=3)
    g_t = sensor(imgs, loc)

    rnn = core_network(input_size=256, hidden_size=256)
    h_t = Variable(torch.zeros(g_t.shape[0], 256))
    h_t = rnn(g_t, h_t)

    classifier = action_network(256, 10)
    a_t = classifier(h_t)

    loc_net = location_network(256, 2, 0.11)
    mu, l_t = loc_net(h_t)

    base = baseline_network(256, 1)
    b_t = base(h_t)

    print("g_t: {}".format(g_t.shape))
    print("h_t: {}".format(h_t.shape))
    print("l_t: {}".format(l_t.shape))
    print("a_t: {}".format(a_t.shape))
    print("b_t: {}".format(b_t.shape))
Example #5
    def __init__(self):

        super(RecurrentSpatialTransformer, self).__init__()

        self.context = context_network(100, 512)
        self.rnn = nn.GRU(512, 512, 1)
        self.locator = location_network(input_size=512, hidden_size=1024)

        # Crop the image with a differentiable STN at the given location and zoom
        self.stn = stn_zoom(out_height=26, out_width=26)
        self.sensor = glimpse_network(glimpse_size=26, h_g=512, h_l=1024, c=1)
        self.classifier = classification_network(input_size=512,
                                                 hidden_size=1024,
                                                 num_classes=10)
        self.context_2 = context_network_2(12, 1024)
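In both RecurrentSpatialTransformer variants the context network sees a coarse view of the whole image, so the recurrence starts from an informed first fixation instead of a random one. A hypothetical initialization step consistent with context_network(100, 512); the flattened 10x10 input is an assumption, and `F` is torch.nn.functional:

    def init_state(self, x):
        coarse = F.interpolate(x, size=(10, 10))  # assumed coarse view, [B, 1, 10, 10]
        h_0 = self.context(coarse.flatten(1))     # [B, 512] initial state
        mu, l_0 = self.locator(h_0)               # first fixation
        return h_0, l_0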
Example #6
    def __init__(self,
                 g,
                 k,
                 s,
                 c,
                 h_g,
                 h_l,
                 std,
                 hidden_size,
                 num_classes):
        """
        Initialize the recurrent attention model and its
        different components.

        Args
        ----
        - g: size of the square patches in the glimpses extracted
          by the retina.
        - k: number of patches to extract per glimpse.
        - s: scaling factor that controls the size of successive patches.
        - c: number of channels in each image.
        - h_g: hidden layer size of the fc layer for `phi`.
        - h_l: hidden layer size of the fc layer for `l`.
        - std: standard deviation of the Gaussian policy.
        - hidden_size: hidden size of the rnn.
        - num_classes: number of classes in the dataset.
        """
        super(RecurrentAttention, self).__init__()
        self.std = std

        # extract features from x at location l_t_prev and fuse the patch
        # features with their locations
        self.sensor = glimpse_network(h_g, h_l, g, k, s, c)
        # fuse the current glimpse feature g_t with the previous hidden state h_t
        self.rnn = core_network(hidden_size, hidden_size)
        # use the internal state `h_t` of the core network to produce the
        # location coordinates `l_t` for the next time step; it takes only
        # the new h_t as input, not the old l_t_prev
        self.locator = location_network(hidden_size, 2, std)
        self.classifier = action_network(hidden_size, num_classes)
        self.baseliner = baseline_network(hidden_size, 1)
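The locator comment above implies a particular design: the policy mean is computed from a detached hidden state, so only the REINFORCE signal, not the classification gradient, trains it. A self-contained sketch of such a location_network (an assumption, not necessarily the repository code):

import torch
import torch.nn as nn

class location_network_sketch(nn.Module):
    def __init__(self, input_size, output_size, std):
        super().__init__()
        self.std = std
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, h_t):
        mu = torch.tanh(self.fc(h_t.detach()))      # mean location in [-1, 1]
        l_t = mu + torch.randn_like(mu) * self.std  # sample the Gaussian policy
        return mu, torch.clamp(l_t, -1.0, 1.0)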
Example #7
    def __init__(self,
                 g,
                 k,
                 s,
                 c,
                 h_g,
                 h_l,
                 std,
                 hidden_size,
                 num_classes, 
                 kernel_size, 
                 num_stacks, 
                 stack_attn_mode):
        """
        Initialize the recurrent attention model and its
        different components.

        Args
        ----
        - g: size of the square patches in the glimpses extracted
          by the retina.
        - k: number of patches to extract per glimpse.
        - s: scaling factor that controls the size of successive patches.
        - c: number of channels in each image.
        - h_g: hidden layer size of the fc layer for `phi`.
        - h_l: hidden layer size of the fc layer for `l`.
        - std: standard deviation of the Gaussian policy.
        - hidden_size: hidden size of the rnn.
        - num_classes: number of classes in the dataset.
        - kernel_size: list of int, convolutional kernel sizes in the stacked RAM.
        - num_stacks: int, number of layers in the stacked RAM.
        - stack_attn_mode: str, one of 'separate', 'concat', 'combine'.
        """
        super(RecurrentAttention, self).__init__()
        self.std = std
        self.num_stacks = num_stacks
        self.stack_attn_mode = stack_attn_mode

        self.sensor = nn.ModuleList([
                glimpse_network(h_g, h_l, g, k, s, c, kernel_size)
                for _ in range(num_stacks)
                ])
        self.rnn = nn.ModuleList([
                core_network(h_g + h_l, hidden_size)
                for _ in range(num_stacks)
                ])
        if stack_attn_mode == 'separate':
            self.locator = nn.ModuleList([
                location_network(hidden_size, 2, std)
                for _ in range(num_stacks)
                ])
        elif stack_attn_mode == 'concat':
            self.locator = location_network(hidden_size * num_stacks, 2, std)
        elif stack_attn_mode == 'combine':
            self.locator = location_network(hidden_size * num_stacks, 2 * num_stacks, std)
        else:
            raise ValueError('Unknown stack_attn_mode [%s]' % stack_attn_mode)

        self.baseliner = nn.ModuleList([
            baseline_network(hidden_size, 1) for _ in range(num_stacks)
            ])
        self.classifier = action_network(hidden_size * num_stacks, num_classes)
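The three stack_attn_mode settings differ only in how the per-stack hidden states become locations: independent heads, one shared location, or a single head that emits a location for every stack. A hypothetical method showing how a forward pass could consume each mode (names follow the constructor above):

    def next_locations(self, h):
        # h: list of num_stacks hidden states, each [B, hidden_size]
        if self.stack_attn_mode == 'separate':
            return [loc(h_i)[1] for loc, h_i in zip(self.locator, h)]
        h_cat = torch.cat(h, dim=1)  # [B, hidden_size * num_stacks]
        mu, l_t = self.locator(h_cat)
        if self.stack_attn_mode == 'concat':
            return [l_t] * self.num_stacks              # one shared 2-d location
        return list(l_t.chunk(self.num_stacks, dim=1))  # 'combine': 2-d per stack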
Example #8
    def __init__(self, g, c, image_size, std, hidden_size, num_classes,
                 config):
        """
        Initialize the recurrent attention model and its
        different components.

        Args
        ----
        - g: size of the square patches in the glimpses extracted
          by the retina.
        - c: number of channels in each image.
        - image_size: tuple of ints (H, W).
        - std: standard deviation of the Gaussian policy.
        - hidden_size: hidden size of the rnn.
        - num_classes: number of classes in the dataset.
        """
        super(RecurrentAttention, self).__init__()

        # used when the location l is drawn from a Gaussian distribution
        self.std = std

        # used when the location l is drawn from a symmetric stable (Levy) distribution
        self.alpha = config.alpha
        self.gamma = config.gamma

        self.config = config

        self.context = context_network(c, config.kernel_size, hidden_size)
        self.sensor = glimpse_network(hidden_size, g, c, config)
        self.rnn = core_network(hidden_size, hidden_size, config)
        self.top_down_locator = location_network(hidden_size, 2, config)
        self.bot_up_locator = Levy_bottom_up_generator(config.batch_size,
                                                       image_size, config)
        self.combine_location = combine_location_network(hidden_size, config)
        self.classifier = action_network(hidden_size, num_classes)
        self.baseliner = baseline_network(hidden_size, 1)

        # buffers used by the initialization subroutine
        dtype = (torch.cuda.FloatTensor
                 if self.config.use_gpu else torch.FloatTensor)

        # derivative filters for the saliency map
        self.derivative_y = torch.tensor([-1, 0, 1]).reshape(1, 1, 3, 1).type(dtype)
        self.derivative_x = torch.tensor([-1, 0, 1]).reshape(1, 1, 1, 3).type(dtype)
        # a weighted saliency s gauged at a fixation center; the paper uses
        # image_size[0] / 6, but PyTorch does not accept such a big kernel
        self.gaussian_kernel_sigma = math.floor(image_size[0] / 12)
        gaussian_kernel_size = 2 * self.gaussian_kernel_sigma + 1
        coords = torch.arange(-self.gaussian_kernel_sigma,
                              self.gaussian_kernel_sigma + 1).type(dtype)
        tmp_x, tmp_y = torch.meshgrid(coords, coords)
        self.gaussian_kernel = torch.exp(
            -(tmp_x ** 2 + tmp_y ** 2) / self.gaussian_kernel_sigma ** 2
        ).reshape(1, 1, gaussian_kernel_size, gaussian_kernel_size)
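A hedged sketch of how these buffers could be combined into the weighted saliency the comments describe: the derivative filters yield a gradient-magnitude map, which the Gaussian kernel then smooths around a fixation center. Padding choices are assumptions, and `F` is torch.nn.functional:

    def saliency(self, x_gray):
        # x_gray: [B, 1, H, W] single-channel image
        gy = F.conv2d(x_gray, self.derivative_y, padding=(1, 0))
        gx = F.conv2d(x_gray, self.derivative_x, padding=(0, 1))
        s = torch.sqrt(gx ** 2 + gy ** 2)      # gradient magnitude
        pad = self.gaussian_kernel_sigma       # keeps the spatial size
        return F.conv2d(s, self.gaussian_kernel, padding=pad)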