Пример #1
0
    def run(self):
        """
        Drain ``in_q``, batch the received descriptor elements and push their
        small codes onto ``out_q``.

        Elements are read from ``self.in_q`` until a falsy packet is received.
        They accumulate until ``self.batch`` elements are pending, at which
        point the whole batch is encoded with a single matrix operation. Any
        elements still pending when the terminating packet arrives are encoded
        as one final, smaller batch.

        For every element a ``(code, element)`` pair is put on ``self.out_q``,
        where ``code`` is ``bit_vector_to_int`` applied to that element's bit
        vector.

        """
        def encode_and_emit(elems):
            # One bit per output column: 1 where the projection of the
            # centered vector onto ``self.r`` is non-negative, 0 otherwise.
            # (``self.m_vec`` / ``self.r`` are presumably the ITQ mean vector
            # and rotation matrix -- confirm against the constructor.)
            self._log.debug("[%s] Computing batch of %d", self.name, len(elems))
            vectors = np.array([e.vector() for e in elems])
            projected = np.dot(vectors - self.m_vec, self.r)
            bit_rows = (projected >= 0).astype(np.uint8)
            for row, elem in zip(bit_rows, elems):
                self.out_q.put((bit_vector_to_int(row), elem))

        pending = []
        packet = self.in_q.get()
        while packet:
            pending.append(packet)
            if len(pending) >= self.batch:
                encode_and_emit(pending)
                pending = []
            packet = self.in_q.get()

        # Flush whatever did not fill a complete batch.
        if pending:
            encode_and_emit(pending)
Пример #2
0
    def get_small_code(self, descriptor):
        """
        Compute the small-code of a single descriptor.

        Requires that a model is loaded, i.e. that the mean vector and the
        rotation matrix are set on this instance.

        :param descriptor: Descriptor to generate the small code for.
        :type descriptor: smqtk.representation.DescriptorElement

        :return: Three values: the descriptor's vector, its bit vector (one
            ``uint8`` per bit, 1 where the rotated, centered vector is
            non-negative), and the bit vector packed into an integer.
        :rtype: numpy.core.multiarray.ndarray[float],
                numpy.core.multiarray.ndarray[numpy.uint8],
                int

        """
        vec = descriptor.vector()
        # Center on the stored mean, then apply the stored rotation.
        rotated = numpy.dot(vec - self._mean_vector, self._r)
        # Threshold at zero to obtain the bit vector.
        bit_vec = numpy.asarray(rotated >= 0).astype(numpy.uint8)
        small_code = bit_utils.bit_vector_to_int(bit_vec)
        return vec, bit_vec, small_code
Пример #3
0
    def build_index(self, descriptors):
        """
        Build the index over the descriptor data elements.

        The first part of this method is equivalent to the compressITQ function
        from UNC-CH's implementation.

        Processing steps:

        1. Refuse to run if a mean vector / rotation is already set or the
           code index is non-empty.
        2. Collect the input descriptors and turn them into a matrix.
        3. Center the matrix on its column mean (optionally saving the mean
           vector to ``_mean_vec_cache_filepath``).
        4. Project onto the ``_bit_len`` eigenvectors of the covariance matrix
           with the largest eigenvalues.
        5. Find the ITQ rotation (optionally saving it to
           ``_rotation_cache_filepath``).
        6. Pack the resulting bit vectors into integers and add them, paired
           with their descriptors, to the code index.

        When ``_rand_seed`` is not ``None`` the global numpy random generator
        is seeded with it first. A seed of ``0`` is honoured.

        :raises RuntimeError: A current data model is loaded, or the current
            CodeIndex is not empty.
        :raises ValueError: No data available in the given iterable.

        :param descriptors: Iterable of descriptor elements to build index over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        # Halt if we are going to overwrite a loaded mean/rotation cache.
        if self._mean_vector is not None or self._r is not None:
            raise RuntimeError("Current ITQ model is not empty (cached mean / "
                               "rotation). For the sake of protecting data, we "
                               "are not proceeding.")
        # Halt if the code index currently isn't empty
        if self.count():
            raise RuntimeError("Current CodeIndex instance is not empty. For "
                               "the sake of protecting data, we are not "
                               "proceeding.")

        self._log.debug("Using %d length bit-vectors", self._bit_len)

        # TODO: Sub-sample down descriptors to use for PCA + ITQ
        #       - Harry was also working on an iterative training approach so
        #           that we only have to have a limited number of vectors in
        #           memory at a time.
        # Compare against None so that a seed of 0 is not silently ignored.
        if self._rand_seed is not None:
            numpy.random.seed(self._rand_seed)

        with SimpleTimer("Creating descriptor cache", self._log.info):
            # Materialize the iterable: it is traversed more than once below.
            #: :type: list[smqtk.representation.DescriptorElement]
            descr_cache = list(descriptors)
            if not descr_cache:
                raise ValueError("No descriptors given!")
        with SimpleTimer("Creating matrix of descriptors for training",
                         self._log.info):
            # Get non-memory vectors on separate processes and aggregate into
            # matrix.
            self._log.debug("Input elements: %d", len(descr_cache))
            self._log.debug("Input elem size: %s", descr_cache[0].vector().size)
            # Only ask for periodic progress reports when debug logging is on.
            dbg_report_interval = None
            if self.logger().getEffectiveLevel() <= logging.DEBUG:
                dbg_report_interval = 1.0  # seconds
            x = elements_to_matrix(descr_cache,
                                   report_interval=dbg_report_interval)
            self._log.debug("descriptor matrix shape: %s", x.shape)

        with SimpleTimer("Centering data", self._log.info):
            # center the data, VERY IMPORTANT for ITQ to work
            self._mean_vector = numpy.mean(x, axis=0)
            x -= self._mean_vector
        if self._mean_vec_cache_filepath:
            with SimpleTimer("Saving mean vector", self._log.info):
                file_utils.safe_create_dir(osp.dirname(self._mean_vec_cache_filepath))
                numpy.save(self._mean_vec_cache_filepath, self._mean_vector)

        # PCA
        with SimpleTimer("Computing PCA transformation", self._log.info):
            # numpy and matlab observation format is flipped, thus added
            # transpose
            self._log.debug("-- computing covariance")
            c = numpy.cov(x.transpose())

            # Direct translation
            # - eigen vectors are the columns of ``pc``
            # NOTE(review): ``c`` is a covariance matrix, so
            #   ``numpy.linalg.eigh`` may be the more appropriate solver;
            #   left as ``eig`` to keep the produced model unchanged.
            self._log.debug('-- computing linalg.eig')
            l, pc = numpy.linalg.eig(c)
            # ordered by greatest eigenvalue, keeping top ``bit_len``
            self._log.debug('-- computing top pairs')
            top_pairs = sorted(zip(l, pc.transpose()),
                               key=lambda p: p[0],
                               reverse=True
                               )[:self._bit_len]

            # # Harry translation -- Uses singular values / vectors, not eigen
            # # - singular vectors are the rows of pc
            # pc, l, _ = numpy.linalg.svd(c)
            # top_pairs = sorted(zip(l, pc),
            #                    key=lambda p: p[0],
            #                    reverse=True
            #                    )[:self._bit_len]

            # Eigen-vectors of the top ``bit_len`` eigenvalues, as columns
            self._log.debug("-- top vector extraction")
            pc_top = numpy.array([p[1] for p in top_pairs]).transpose()
            self._log.debug("-- transform centered data by PC matrix")
            xx = numpy.dot(x, pc_top)

        # ITQ to find optimal rotation.
        #   `c` is the output codes for matrix `x`
        #   `r` is the rotation found by ITQ
        with SimpleTimer("Performing ITQ to find optimal rotation",
                         self._log.info):
            c, self._r = self._find_itq_rotation(xx, self._itq_iter_num)
            # De-adjust rotation with PC vector
            self._r = numpy.dot(pc_top, self._r)
        if self._rotation_cache_filepath:
            with SimpleTimer("Saving rotation matrix", self._log.info):
                file_utils.safe_create_dir(osp.dirname(self._rotation_cache_filepath))
                numpy.save(self._rotation_cache_filepath, self._r)

        # Populating small-code index
        #   - Converting bit-vectors proved faster than creating new codes over
        #       again (~0.01s vs ~0.04s for 80 vectors).
        with SimpleTimer("Clearing code index", self._log.info):
            self._code_index.clear()
        with SimpleTimer("Converting bit-vectors into small codes, inserting "
                         "into code index", self._log.info):
            # Row ``i`` of ``c`` is the bit vector of ``descr_cache[i]``.
            self._code_index.add_many_descriptors(
                (bit_utils.bit_vector_to_int(code_vec), descr)
                for code_vec, descr in zip(c, descr_cache)
            )
Пример #4
0
    def build_index(self, descriptors):
        """
        Build the index over the descriptor data elements.

        Subsequent calls to this method should rebuild the index, not add to it.

        The first part of this method is equivalent to the compressITQ function
        from UNC-CH's implementation.

        The mean vector (``_mean_vector``) and rotation (``_r``) stored on this
        instance are overwritten. When ``_rand_seed`` is not ``None`` the
        global numpy random generator is seeded with it first. A seed of ``0``
        is honoured.

        :raises ValueError: No data available in the given iterable.

        :param descriptors: Iterable of descriptor elements to build index over.
        :type descriptors: collections.Iterable[smqtk.data_rep.DescriptorElement]

        """
        # TODO: Sub-sample down descriptors to use for PCA + ITQ
        #       - Harry was also working on an iterative training approach so
        #           that we only have to have a limited number of vectors in
        #           memory at a time.
        # Compare against None so that a seed of 0 is not silently ignored.
        if self._rand_seed is not None:
            numpy.random.seed(self._rand_seed)

        with SimpleTimer("Creating descriptor matrix", self._log.info):
            x = []
            #: :type: list[smqtk.data_rep.DescriptorElement]
            descr_cache = []
            # Single pass: the iterable may only be consumable once.
            for d in descriptors:
                descr_cache.append(d)
                x.append(d.vector())
            if not x:
                raise ValueError("No descriptors given!")
            x = numpy.array(x)

        with SimpleTimer("Centering data", self._log.info):
            # center the data, VERY IMPORTANT for ITQ to work
            self._mean_vector = numpy.mean(x, axis=0)
            x -= self._mean_vector

        # PCA
        with SimpleTimer("Computing PCA transformation", self._log.info):
            # numpy and matlab observation format is flipped, thus added
            # transpose
            c = numpy.cov(x.transpose())

            # Direct translation
            # - eigen vectors are the columns of ``pc``
            l, pc = numpy.linalg.eig(c)
            # ordered by greatest eigenvalue, keeping top ``bit_len``
            top_pairs = sorted(zip(l, pc.transpose()),
                               key=lambda p: p[0],
                               reverse=True
                               )[:self._bit_len]

            # # Harry translation -- Uses singular values / vectors, not eigen
            # pc, l, _ = numpy.linalg.svd(c)
            # top_pairs = sorted(zip(l, pc),
            #                    key=lambda p: p[0],
            #                    reverse=True
            #                    )[:self._bit_len]

            # Eigenvectors of the top ``bit_len`` eigenvalues, as columns
            pc_top = numpy.array([p[1] for p in top_pairs]).transpose()
            xx = numpy.dot(x, pc_top)

        # ITQ to find optimal rotation.
        #   `c` is the output codes for matrix `x`
        #   `r` is the rotation found by ITQ
        with SimpleTimer("Performing ITQ to find optimal rotation",
                         self._log.info):
            c, self._r = self._find_itq_rotation(xx, self._itq_iter_num)
            # De-adjust rotation with PC vector
            self._r = numpy.dot(pc_top, self._r)

        # Populating small-code hash-table
        #   - Converting bit-vectors proved faster than creating new codes over
        #       again (~0.01s vs ~0.04s for 80 vectors).
        # NOTE(review): the docstring promises a rebuild, but nothing here
        #   empties ``self._code_index`` before inserting -- confirm whether
        #   the index must be cleared first.
        with SimpleTimer("Converting bitvectors into small codes",
                         self._log.info):
            # Row ``i`` of ``c`` is the bit vector of ``descr_cache[i]``.
            for code_vec, descr in zip(c, descr_cache):
                packed = bit_utils.bit_vector_to_int(code_vec)
                self._code_index.add_descriptor(packed, descr)