예제 #1
0
    def _write_file_chunks(self, chunk_map, file_extension=""):
        """
        Given a mapping of chunks, write their contents to a temporary file,
        returning the path to that file.

        Returned file path should be manually removed by the user.

        :param chunk_map: Mapping of integer index to file-like chunk
        :type chunk_map: dict of (int, StringIO)
        :param file_extension: String extension to suffix the temporary file
            with
        :type file_extension: str

        :raises OSError: OS problems creating temporary file or writing it out.

        :return: Path to temporary combined file
        :rtype: str

        """
        # Make sure write dir exists...
        if not os.path.isdir(self.working_dir):
            safe_create_dir(self.working_dir)
        tmp_fd, tmp_path = tempfile.mkstemp(file_extension, dir=self.working_dir)
        self.log.debug("Combining chunks into temporary file: %s", tmp_path)
        # tmp_file = os.fdopen(tmp_fd, 'wb')
        tmp_file = open(tmp_path, "wb")
        for idx, chunk in sorted(chunk_map.items(), key=lambda p: p[0]):
            data = chunk.read()
            tmp_file.write(data)
        tmp_file.close()  # apparently also closes file descriptor?
        os.close(tmp_fd)
        return tmp_path
예제 #2
0
파일: flann.py 프로젝트: z-harry-sun/SMQTK
    def save_index(self):
        """
        Write the current index state out to the configured save location.

        ``self._restore_index()`` is invoked first. The FLANN index is then
        saved to ``self._sf_flann_index`` and the remaining state (build
        parameters, descriptor cache, distance method and random seed) is
        pickled to ``self._sf_state``.

        This will overwrite previously saved state data given the same
        configuration.

        :raises SimilarityIndexStateSaveError: There is no index to save.

        """
        self._restore_index()
        if self._flann is None:
            raise SimilarityIndexStateSaveError("No index built yet to save")

        # Make sure the target directory exists before writing anything.
        safe_create_dir(self._save_dir)
        self._flann.save_index(self._sf_flann_index)

        # Everything besides the FLANN structure itself that is needed to
        # describe this index.
        state = dict(
            flann_params=self._flann_build_params,
            descr_cache=self._descr_cache,
            distance_method=self._distance_method,
            rand_seed=self._rand_seed,
        )
        with open(self._sf_state, 'wb') as state_file:
            cPickle.dump(state, state_file)
예제 #3
0
    def save_index(self, dir_path):
        """
        Write the current index state into the given directory.

        ``self._restore_index()`` is invoked first. Within the directory, the
        FLANN index is saved under the file name ``self._sf_flann_index`` and
        the remaining state (build parameters, descriptor cache, distance
        method and random seed) is pickled under ``self._sf_state``.

        This will overwrite previously saved state data in the same directory.

        :raises SimilarityIndexStateSaveError: There is no index to save.

        :param dir_path: Path to the directory to save the index to. A leading
            user-home marker is expanded and the path is made absolute.
        :type dir_path: str

        """
        self._restore_index()
        if self._flann is None:
            raise SimilarityIndexStateSaveError("No index built yet to save")

        out_dir = osp.abspath(osp.expanduser(dir_path))
        safe_create_dir(out_dir)
        index_filepath = osp.join(out_dir, self._sf_flann_index)
        self._flann.save_index(index_filepath)

        # Everything besides the FLANN structure itself that is needed to
        # describe this index.
        state = dict(
            flann_params=self._flann_build_params,
            descr_cache=self._descr_cache,
            distance_method=self._distance_method,
            rand_seed=self._rand_seed,
        )
        state_filepath = osp.join(out_dir, self._sf_state)
        with open(state_filepath, 'wb') as state_file:
            cPickle.dump(state, state_file)
예제 #4
0
    def get_preview_image(self, elem):
        """
        Get the filepath to the preview image for the given data element.

        A previously generated preview is returned from the preview cache
        (keyed on the element's MD5). Otherwise a generator function is looked
        up in ``PREVIEW_GEN_METHOD``, first by the element's full content type
        and then by its content class (the part before the ``/``). The
        generated path is stored in the cache before being returned.

        :raises ValueError: Do not know how to generate a preview image for the
            given element's content type.

        :param elem: Data element to generate a preview image for.
        :type elem: smqtk.data_rep.DataElement

        :return: Path to the preview image for the given data element.
        :rtype: str

        """
        md5 = elem.md5()
        if md5 in self._preview_cache:
            return self._preview_cache[md5]

        # else, generate preview image based on content type / content class
        content_type = elem.content_type()
        if content_type in self.PREVIEW_GEN_METHOD:
            gen_key = content_type
            self._log.debug("Generating preview image based on content type: "
                            "%s", content_type)
        else:
            content_class = content_type.split('/', 1)[0]
            if content_class not in self.PREVIEW_GEN_METHOD:
                raise ValueError("No preview generation method for the data "
                                 "element provided, of content type '%s'."
                                 % content_type)
            gen_key = content_class
            self._log.debug("Generating preview image based on content "
                            "class: %s", content_class)

        safe_create_dir(self._cache_dir)
        preview_path = self.PREVIEW_GEN_METHOD[gen_key](elem, self._cache_dir)
        # Remember the result so the lookup at the top of this method can
        # actually hit on subsequent calls for the same content.
        self._preview_cache[md5] = preview_path
        return preview_path
예제 #5
0
파일: itq.py 프로젝트: jonathan-owens/SMQTK
    def save_index(self, dir_path):
        """
        Pickle the current index state into a file in the given directory.

        The saved state consists of the bit length, ITQ iteration count,
        random seed, mean vector, rotation matrix, code index and distance
        method. The file is named by ``self._save_file`` and is overwritten if
        it already exists.

        :raises SimilarityIndexStateSaveError: No rotation matrix is present,
            i.e. there is no index to save.

        :param dir_path: Path to the directory to save the index to.
        :type dir_path: str

        """
        if self._r is None:
            raise SimilarityIndexStateSaveError("No index build yet to save.")

        state = dict(
            bit_len=self._bit_len,
            itq_iter=self._itq_iter_num,
            rand_seed=self._rand_seed,
            mean_vector=self._mean_vector,
            rotation=self._r,
            code_index=self._code_index,  # should be picklable
            distance_method=self._dist_method,
        )

        safe_create_dir(dir_path)
        state_filepath = osp.join(dir_path, self._save_file)
        with open(state_filepath, 'wb') as state_file:
            cPickle.dump(state, state_file)
예제 #6
0
    def write_temp(self, temp_dir=None):
        """
        Write this data's bytes to a temporary file on disk, returning the path
        to the written file, whose extension is guessed based on this data's
        content type.

        The file is only written the first time; later calls return the
        recorded path (``temp_dir`` is ignored then). The path is recorded only
        once the bytes were written successfully, so a failed write does not
        leave a bad path behind for later calls.

        NOTE:
            The file path returned should not be explicitly removed by the user.
            Instead, the ``clean_temp()`` method should be called on this
            object.

        :param temp_dir: Optional directory to write temporary file in,
            otherwise we use the platform default temporary files directory.
        :type temp_dir: None or str

        :return: Path to the temporary file
        :rtype: str

        """
        if not getattr(self, '_temp_filepath', None):
            if temp_dir:
                safe_create_dir(temp_dir)
            # guess_extension() returns None for content types it does not
            # know; fall back on no suffix since mkstemp under Python 2 cannot
            # take None.
            suffix = MIMETYPES.guess_extension(self.content_type()) or ''
            fd, temp_filepath = tempfile.mkstemp(suffix=suffix, dir=temp_dir)
            try:
                # Write through the descriptor mkstemp opened instead of
                # closing it and re-opening the path.
                with os.fdopen(fd, 'wb') as ofile:
                    ofile.write(self.get_bytes())
            except Exception:
                # Nothing tracks this file yet, so remove it ourselves.
                if os.path.isfile(temp_filepath):
                    os.remove(temp_filepath)
                raise
            # noinspection PyAttributeOutsideInit
            self._temp_filepath = temp_filepath
        return self._temp_filepath
예제 #7
0
    def _save_data_elements(self):
        """
        Pickle every data element in the element map into its own file under
        the root directory, while holding the element map lock.

        Each element is written into a nested directory derived from
        partitioning its MD5 sum, in a file named via
        ``SERIAL_FILE_TEMPLATE`` from the element's UUID and MD5.
        """
        with self._element_map_lock:
            self._log.debug("Serializing data elements into: %s",
                            self._root_dir)
            for uuid, de in self._element_map.iteritems():
                # Temporary files an element generated should not outlive
                # its serialization.
                de.clean_temp()

                md5 = de.md5()
                # NOTE(review): an earlier comment here said the trailing
                # chunk is left off to avoid one directory per md5-sum, but
                # every part returned by partition_string is used -- confirm.
                md5_parts = partition_string(md5, self._md5_chunk)
                containing_dir = os.path.join(self._root_dir, *md5_parts)
                if not os.path.isdir(containing_dir):
                    safe_create_dir(containing_dir)

                file_name = self.SERIAL_FILE_TEMPLATE % (str(uuid), md5)
                output_fname = os.path.join(containing_dir, file_name)
                with open(output_fname, 'wb') as ofile:
                    cPickle.dump(de, ofile)
            self._log.debug("Serializing data elements -- Done")
예제 #8
0
 def write_temp(d):
     """
     Write the element's bytes to a new temporary file and return its path.

     A new file is created on every call. ``self`` is not a parameter here; it
     is resolved from the enclosing scope.

     :param d: Directory to create the file in (created if missing), or a
         false value to use the platform default temporary directory.

     :return: Path to the file written.
     """
     if d:
         safe_create_dir(d)
     # guess_extension() returns None for content types it does not know;
     # fall back on no suffix since mkstemp under Python 2 cannot take None.
     ext = MIMETYPES.guess_extension(self.content_type()) or ''
     fd, fp = tempfile.mkstemp(suffix=ext, dir=d)
     # Write through the descriptor mkstemp opened instead of closing it and
     # re-opening the path.
     with os.fdopen(fd, "wb") as f:
         f.write(self.get_bytes())
     return fp
예제 #9
0
    def set_vector(self, new_vec):
        """
        Store the given vector at this element's vector file path via
        ``numpy.save``, creating the containing directory first.

        A previously stored descriptor vector is overwritten.

        :param new_vec: New vector to contain.
        :type new_vec: numpy.core.multiarray.ndarray

        """
        containing_dir = osp.dirname(self._vec_filepath)
        safe_create_dir(containing_dir)
        numpy.save(self._vec_filepath, new_vec)
예제 #10
0
    def _get_checkpoint_dir(self, data):
        """
        Get (creating it as needed) the directory that holds checkpoint
        material for the given data element.

        The directory is nested under the work directory, using the parts
        produced by ``partition_string`` on the element's MD5 with an argument
        of 8.

        :param data: Data element
        :type data: smqtk.data_rep.DataElement

        :return: directory path
        :rtype: str

        """
        md5_parts = partition_string(data.md5(), 8)
        checkpoint_dir = osp.join(self._work_dir, *md5_parts)
        safe_create_dir(checkpoint_dir)
        return checkpoint_dir
예제 #11
0
    def generate_model(self, descriptor_map, parallel=None, **kwargs):
        """
        Build this indexer's data model from the given features and save it to
        files in the configured data directory.

        After delegating to the parent implementation, the features are stacked
        row-wise into a matrix alongside a parallel list of their UIDs. The UID
        list is pickled to ``uid_list_filepath`` and the matrix is saved to
        ``feature_mat_filepath``.

        :raises RuntimeError: Precaution error when there is an existing data
            model for this indexer. Manually delete or move the existing
            model before computing another one.

            Specific implementations may error on other things. See the specific
            implementations for more details.

        :raises ValueError: The given feature map had no content.

        :param descriptor_map: Mapping of integer IDs to feature data. All feature
            data must be of the same size!
        :type descriptor_map: dict of (int, numpy.core.multiarray.ndarray)

        :param parallel: Optional specification of how many processors to use
            when pooling sub-tasks. If None, we attempt to use all available
            cores. Only forwarded to the parent implementation here.
        :type parallel: int

        """
        super(NaiveBayesMultinomial, self).generate_model(descriptor_map, parallel)

        # The feature of the lowest-sorting UID serves as the template for row
        # length and dtype of the matrix.
        first_uid = sorted(descriptor_map.keys())[0]
        template = descriptor_map[first_uid]
        n_rows = len(descriptor_map)
        n_cols = len(template)

        # Pre-allocating arrays
        self._uid_array = []
        self._feature_mat = numpy.zeros((n_rows, n_cols),
                                        dtype=template.dtype)

        self.log.info("Populating feature matrix")
        # Row ``row`` of the matrix belongs to the UID at the same list index.
        for row, (uid, feat) in enumerate(descriptor_map.iteritems()):
            self._uid_array.append(uid)
            self._feature_mat[row] = feat

        with SimpleTimer("Saving data files", self.log.info):
            safe_create_dir(self.data_dir)
            with open(self.uid_list_filepath, 'wb') as uid_file:
                cPickle.dump(self._uid_array, uid_file)
            numpy.save(self.feature_mat_filepath, self._feature_mat)
예제 #12
0
파일: __init__.py 프로젝트: mrG7/SMQTK
 def write_temp(d):
     """
     Write the element's bytes to a new temporary file and return its path.

     A new file is created on every call. The file extension is guessed from
     the content type, with ``.jpe`` / ``.jfif`` normalized to ``.jpg``.
     ``self`` is not a parameter here; it is resolved from the enclosing scope.

     :param d: Directory to create the file in (created if missing), or a
         false value to use the platform default temporary directory.

     :return: Path to the file written.
     """
     if d:
         safe_create_dir(d)
     # guess_extension() returns None for content types it does not know;
     # fall back on no suffix since mkstemp under Python 2 cannot take None.
     ext = MIMETYPES.guess_extension(self.content_type()) or ''
     # Exceptions because mimetypes is apparently REALLY OLD
     if ext in {'.jpe', '.jfif'}:
         ext = '.jpg'
     fd, fp = tempfile.mkstemp(suffix=ext, dir=d)
     # Write through the descriptor mkstemp opened instead of closing it and
     # re-opening the path.
     with os.fdopen(fd, 'wb') as f:
         f.write(self.get_bytes())
     return fp
예제 #13
0
    def test_existError_alreadyExists(self, mock_os_makedirs, mock_osp_exists):
        # Scenario: makedirs fails with EEXIST and the existence check
        # confirms the path is there. The path should still be handed back.
        target = '/existing/dir'
        mock_osp_exists.return_value = True
        mock_os_makedirs.side_effect = OSError(errno.EEXIST,
                                               "Existing directory")

        result = safe_create_dir(target)

        ntools.assert_true(mock_os_makedirs.called)
        ntools.assert_true(mock_osp_exists.called)
        mock_osp_exists.assert_called_once_with(target)
        ntools.assert_equal(result, target)
예제 #14
0
파일: itq.py 프로젝트: mrG7/SMQTK
    def build_index(self, descriptors):
        """
        Build the index over the descriptor data elements.

        The first part of this method is equivalent to the compressITQ function
        from UNC-CH's implementation.

        Steps performed:
            1. Stack the descriptor vectors into a matrix and center it on the
               mean vector (saved to the mean vector cache file if configured).
            2. Project the centered data onto the eigenvectors of its covariance
               with the ``bit_len`` greatest eigenvalues.
            3. Find the ITQ rotation for the projected data, then fold the
               projection into it (saved to the rotation cache file if
               configured).
            4. Clear the code index and insert each descriptor under the integer
               form of its bit-vector.

        :raises RuntimeError: A current data model is loaded, or the current
            CodeIndex is not empty.
        :raises ValueError: No data available in the given iterable.

        :param descriptors: Iterable of descriptor elements to build index over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        # Halt if we are going to overwrite a loaded mean/rotation cache.
        if not (self._mean_vector is None and self._r is None):
            raise RuntimeError("Current ITQ model is not empty (cached mean / "
                               "rotation). For the sake of protecting data, we "
                               "are not proceeding.")
        # Halt if the code index currently isn't empty
        if self.count():
            raise RuntimeError("Current CodeIndex instance is not empty. For "
                               "the sake of protecting data, we are not "
                               "proceeding.")

        self._log.debug("Using %d length bit-vectors", self._bit_len)

        # TODO: Sub-sample down descriptors to use for PCA + ITQ
        #       - Harry was also working on an iterative training approach so
        #           that we only have to have a limited number of vectors in
        #           memory at a time.
        # Compare against None explicitly: 0 is a legitimate seed and must not
        # be treated as "no seed given".
        if self._rand_seed is not None:
            numpy.random.seed(self._rand_seed)

        with SimpleTimer("Creating descriptor matrix", self._log.info):
            x = []
            #: :type: list[smqtk.representation.DescriptorElement]
            descr_cache = []
            # Keep elements and vectors in lock-step so that row ``i`` of the
            # matrix belongs to ``descr_cache[i]``.
            for d in descriptors:
                descr_cache.append(d)
                x.append(d.vector())
            if not x:
                raise ValueError("No descriptors given!")
            x = numpy.array(x)

        with SimpleTimer("Centering data", self._log.info):
            # center the data, VERY IMPORTANT for ITQ to work
            self._mean_vector = numpy.mean(x, axis=0)
            x -= self._mean_vector
        if self._mean_vec_cache_filepath:
            with SimpleTimer("Saving mean vector", self._log.info):
                safe_create_dir(osp.dirname(self._mean_vec_cache_filepath))
                numpy.save(self._mean_vec_cache_filepath, self._mean_vector)

        # PCA
        with SimpleTimer("Computing PCA transformation", self._log.info):
            # numpy and matlab observation format is flipped, thus added
            # transpose
            self._log.debug("-- computing covariance")
            c = numpy.cov(x.transpose())

            # Direct translation
            # - eigen vectors are the columns of ``pc``
            self._log.debug('-- computing linalg.eig')
            l, pc = numpy.linalg.eig(c)
            # ordered by greatest eigenvalue magnitude, keeping top ``bit_len``
            self._log.debug('-- computing top pairs')
            top_pairs = sorted(zip(l, pc.transpose()),
                               key=lambda p: p[0],
                               reverse=True
                               )[:self._bit_len]

            # # Harry translation -- Uses singular values / vectors, not eigen
            # # - singular vectors are the rows of pc
            # pc, l, _ = numpy.linalg.svd(c)
            # top_pairs = sorted(zip(l, pc),
            #                    key=lambda p: p[0],
            #                    reverse=1
            #                    )[:self._bit_len]

            # Eigen-vectors of top ``bit_len`` magnitude eigenvalues
            self._log.debug("-- top vector extraction")
            pc_top = numpy.array([p[1] for p in top_pairs]).transpose()
            self._log.debug("-- transform centered data by PC matrix")
            xx = numpy.dot(x, pc_top)

        # ITQ to find optimal rotation.
        #   `c` is the output codes for matrix `x` (the name is re-used; it no
        #       longer refers to the covariance matrix from here on)
        #   `r` is the rotation found by ITQ
        with SimpleTimer("Performing ITQ to find optimal rotation",
                         self._log.info):
            c, self._r = self._find_itq_rotation(xx, self._itq_iter_num)
            # De-adjust rotation with PC vector
            self._r = numpy.dot(pc_top, self._r)
        if self._rotation_cache_filepath:
            with SimpleTimer("Saving rotation matrix", self._log.info):
                safe_create_dir(osp.dirname(self._rotation_cache_filepath))
                numpy.save(self._rotation_cache_filepath, self._r)

        # Populating small-code index
        #   - Converting bit-vectors proved faster than creating new codes over
        #       again (~0.01s vs ~0.04s for 80 vectors).
        with SimpleTimer("Clearing code index", self._log.info):
            self._code_index.clear()
        with SimpleTimer("Converting bit-vectors into small codes, inserting "
                         "into code index", self._log.info):
            self._code_index.add_many_descriptors(
                (bit_utils.bit_vector_to_int(c[i]), descr_cache[i])
                for i in xrange(c.shape[0])
            )
예제 #15
0
    def _compute_descriptor(self, data):
        """
        Given some kind of data, process and return a feature vector as a Numpy
        array.

        Raw descriptors are generated for the data element and quantized
        against the loaded codebook through a FLANN index loaded from
        ``flann_index_filepath``. Depending on ``self._use_sp`` the result is
        either a single relative-frequency histogram over codebook entries, or
        a concatenation of several spatial pyramid histograms. The resulting
        vector is also saved to the element's checkpoint feature file.

        :raises RuntimeError: Feature extraction failure of some kind.

        :param data: Some kind of input data for the feature descriptor. This is
            descriptor dependent.
        :type data: smqtk.data_rep.DataElement

        :return: Feature vector. This is a histogram of N bins where N is the
            number of centroids in the codebook. Bin values is percent
            composition, not absolute counts.
        :rtype: numpy.ndarray

        """
        super(ColorDescriptor_Base, self)._compute_descriptor(data)

        checkpoint_filepath = self._get_checkpoint_feature_file(data)
        # Loading from an existing checkpoint is currently disabled; the
        # descriptor is always recomputed and the checkpoint overwritten.
        # if osp.isfile(checkpoint_filepath):
        #     return numpy.load(checkpoint_filepath)

        # NOTE(review): ``has_model`` is read as an attribute here, not called.
        # This is only a meaningful check if it is a property on this class --
        # confirm, as a plain method would always be truthy.
        if not self.has_model:
            raise RuntimeError("No model currently loaded! Check the existence "
                               "or, or generate, model files!\n"
                               "Codebook path: %s\n"
                               "FLANN Index path: %s"
                               % (self.codebook_filepath,
                                  self.flann_index_filepath))

        self.log.debug("Computing descriptors for data UID[%s]...", data.uuid())
        # Single-element set input; ``info`` is only used by the spatial
        # pyramid branch below (its first two columns).
        info, descriptors = self._generate_descriptor_matrices({data})

        if not self._use_sp:
            ###
            # Codebook Quantization
            #
            # - loaded the model at class initialization if we had one
            self.log.debug("Quantizing descriptors")
            pyflann.set_distance_type(self.FLANN_DISTANCE_FUNCTION)
            flann = pyflann.FLANN()
            flann.load_index(self.flann_index_filepath, self._codebook)
            try:
                idxs, dists = flann.nn_index(descriptors)
            except AssertionError:

                # Log both shapes to help diagnose the failure, then let the
                # error propagate.
                self.log.error("Codebook shape  : %s", self._codebook.shape)
                self.log.error("Descriptor shape: %s", descriptors.shape)

                raise

            # Create histogram
            # - Using explicit bin slots to prevent numpy from automatically
            #   creating tightly constrained bins. This would otherwise cause
            #   histograms between two inputs to be non-comparable (unaligned
            #   bins).
            # - See numpy note about ``bins`` to understand why the +1 is
            #   necessary
            # - Learned from spatial implementation that we could feed multiple
            #   neighbors per descriptor into here, leading to a more populated
            #   histogram.
            #   - Could also possibly weight things based on dist from
            #     descriptor?
            #: :type: numpy.core.multiarray.ndarray
            h = numpy.histogram(idxs,  # indices are all integers
                                bins=numpy.arange(self._codebook.shape[0]+1))[0]
            # self.log.debug("Quantization histogram: %s", h)
            # Normalize histogram into relative frequencies
            # - Not using /= on purpose. h is originally int32 coming out of
            #   histogram. /= would keep int32 type when we want it to be
            #   transformed into a float type by the division.
            if h.sum():
                # noinspection PyAugmentAssignment
                h = h / float(h.sum())
            else:
                # NOTE(review): this keeps the histogram's original dtype,
                # unlike the float result of the branch above -- confirm this
                # difference is intended.
                h = numpy.zeros(h.shape, h.dtype)
            # self.log.debug("Normalized histogram: %s", h)

        else:
            ###
            # Spatial Pyramid Quantization
            #
            self.log.debug("Quantizing descriptors using spatial pyramid")
            ##
            # Quantization factor - number of nearest codes to be saved
            q_factor = 10
            ##
            # Concatenating spatial information to descriptor vectors to format:
            #   [ x y <descriptor> ]
            self.log.debug("Creating combined descriptor matrix")
            m = numpy.concatenate((info[:, :2],
                                   descriptors), axis=1)
            ##
            # Creating quantized vectors, consisting vector:
            #   [ x y c_1 ... c_qf dist_1 ... dist_qf ]
            # which has a total size of 2+(qf*2)
            #
            # Sangmin's code included the distances in the quantized vector, but
            # then also passed this vector into numpy's histogram function with
            # integral bins, causing the [0,1] to be heavily populated, which
            # doesn't make sense to do.
            #   idxs, dists = flann.nn_index(m[:, 2:], q_factor)
            #   q = numpy.concatenate([m[:, :2], idxs, dists], axis=1)
            self.log.debug("Computing nearest neighbors")
            pyflann.set_distance_type(self.FLANN_DISTANCE_FUNCTION)
            flann = pyflann.FLANN()
            flann.load_index(self.flann_index_filepath, self._codebook)
            # Only the neighbor indices are kept; distances are discarded (see
            # the note above).
            idxs = flann.nn_index(m[:, 2:], q_factor)[0]
            self.log.debug("Creating quantization matrix")
            q = numpy.concatenate([m[:, :2], idxs], axis=1)
            ##
            # Build spatial pyramid from quantized matrix
            self.log.debug("Building spatial pyramid histograms")
            hist_sp = self._build_sp_hist(q, self._codebook.shape[0])
            ##
            # Combine each quadrants into single vector
            # - Entries 0, 5, 6 and 7 of ``hist_sp`` are used; presumably the
            #   global histogram plus the three "thirds" regions, going by the
            #   log message below -- verify against ``_build_sp_hist``.
            self.log.debug("Combining global+thirds into final histogram.")
            f = sys.float_info.min  # so as we don't div by 0 accidentally
            rf_norm = lambda h: h / (float(h.sum()) + f)
            h = numpy.concatenate([rf_norm(hist_sp[0]),
                                   rf_norm(hist_sp[5]),
                                   rf_norm(hist_sp[6]),
                                   rf_norm(hist_sp[7])],
                                  axis=1)
            # NOTE(review): unlike ``rf_norm``, this final normalization has no
            # guard against a zero sum -- confirm that cannot occur here.
            # noinspection PyAugmentAssignment
            h /= h.sum()

        self.log.debug("Saving checkpoint feature file")
        if not osp.isdir(osp.dirname(checkpoint_filepath)):
            safe_create_dir(osp.dirname(checkpoint_filepath))
        numpy.save(checkpoint_filepath, h)

        return h
예제 #16
0
    def generate_model(self, descriptor_map, parallel=None, **kwargs):
        """
        Generate this indexers data-model using the given features,
        saving it to files in the configured data directory.

        The features are stacked row-wise into a matrix alongside a parallel
        list of their UIDs, and a symmetric matrix of pair-wise histogram
        intersection distances between all features is computed. The UID list
        is pickled to ``uid_list_filepath``; the feature and distance matrices
        are saved to ``feature_mat_filepath`` and ``distance_mat_filepath``.

        :raises RuntimeError: Precaution error when there is an existing data
            model for this indexer. Manually delete or move the existing
            model before computing another one.

            Specific implementations may error on other things. See the specific
            implementations for more details.

        :raises ValueError: The given feature map had no content.

        :param descriptor_map: Mapping of integer IDs to feature data. All feature
            data must be of the same size!
        :type descriptor_map: dict of (int, numpy.core.multiarray.ndarray)

        :param parallel: Optionally specification of how many processors to use
            when pooling sub-tasks. If None, we attempt to use all available
            cores. Not used by this implementation.
        :type parallel: int

        """
        if self.has_model():
            raise RuntimeError("WARNING: This implementation already has a "
                               "model generated. These can take a long time to "
                               "generate, thus we require external manual "
                               "removal of modal files before we will generate "
                               "a new model.")
        # Honor the documented contract for empty input instead of failing
        # with an IndexError on the sample feature lookup below.
        if not descriptor_map:
            raise ValueError("The given feature map had no content.")

        num_features = len(descriptor_map)

        # The feature of the lowest-sorting UID serves as the template for
        # vector length and dtype.
        sample_feature = descriptor_map[min(descriptor_map.keys())]
        feature_len = len(sample_feature)

        # Pre-allocating arrays
        #: :type: list[collections.Hashable]
        self._uid_array = []
        self._feature_mat = numpy.zeros(
            (num_features, feature_len), dtype=sample_feature.dtype
        )
        self._distance_mat = numpy.zeros(
            (num_features, num_features), dtype=sample_feature.dtype
        )

        with SimpleTimer("Populating feature matrix", self.log.info):
            # Row ``i`` of the matrix belongs to the UID at list index ``i``.
            for i, (uid, feat) in enumerate(descriptor_map.iteritems()):
                self._uid_array.append(uid)
                self._feature_mat[i] = feat

        with SimpleTimer("Computing HI matrix kernel", self.log.info):
            # Using [process] Pool here with large sets eats far too much RAM.
            # Using a ThreadPool here is actually much slower. Not sure why?
            # Only the upper triangle is computed; each value is mirrored into
            # the lower triangle.
            for i in range(num_features):
                for j in range(i, num_features):
                    self._distance_mat[i, j] = self._distance_mat[j, i] = \
                        histogram_intersection_distance(self._feature_mat[i],
                                                        self._feature_mat[j])

        with SimpleTimer("Saving data files", self.log.info):
            safe_create_dir(self.data_dir)
            with open(self.uid_list_filepath, 'wb') as ofile:
                cPickle.dump(self._uid_array, ofile)
            numpy.save(self.feature_mat_filepath, self._feature_mat)
            numpy.save(self.distance_mat_filepath, self._distance_mat)
예제 #17
0
 def temp_dir(self):
     """
     Path of the ``temp_files`` directory under the work directory, as
     returned by ``safe_create_dir``.
     """
     temp_files_dir = osp.join(self._work_dir, 'temp_files')
     return safe_create_dir(temp_files_dir)
예제 #18
0
    def test_noExists(self, mock_os_makedirs):
        # Plain case: the directory creation call is made and the requested
        # path is handed back.
        target = "/some/directory/somewhere"

        result = safe_create_dir(target)

        ntools.assert_true(mock_os_makedirs.called)
        ntools.assert_equals(result, target)
예제 #19
0
 def flann_index_filepath(self):
     """
     Path of the FLANN index file for this descriptor type within the model
     directory. The model directory is created as a side effect.
     """
     safe_create_dir(self._model_dir)
     file_name = "%s.flann_index.dat" % (self.descriptor_type(),)
     return osp.join(self._model_dir, file_name)
예제 #20
0
 def flann_params_filepath(self):
     """
     Path of the FLANN parameters JSON file for this descriptor type within
     the model directory. The model directory is created as a side effect.
     """
     safe_create_dir(self._model_dir)
     file_name = "%s.flann_params.json" % (self.descriptor_type(),)
     return osp.join(self._model_dir, file_name)
예제 #21
0
 def codebook_filepath(self):
     """
     Path of the codebook ``.npy`` file for this descriptor type within the
     model directory. The model directory is created as a side effect.
     """
     safe_create_dir(self._model_dir)
     file_name = "%s.codebook.npy" % (self.descriptor_type(),)
     return osp.join(self._model_dir, file_name)
예제 #22
0
 def work_dir(self):
     """
     The work directory path, created first when it is not already an
     existing directory.
     """
     if osp.isdir(self._work_dir):
         return self._work_dir
     safe_create_dir(self._work_dir)
     return self._work_dir
예제 #23
0
    def build_index(self, descriptors):
        """
        Build the index over the descriptors data elements.

        Subsequent calls to this method should rebuild the index, not add to it.

        The input is validated before any existing state is touched, so a
        failed call with an empty iterable leaves the current index cache file
        and descriptor cache as they were.

        Implementation Notes:
            - We keep a cache file serialization around for our index in case
                sub-processing occurs so as to be able to recover from the
                underlying C data not being there. This could cause issues if
                a main or child process rebuild's the index, as we clear the old
                cache away.

        :raises ValueError: No data available in the given iterable.

        :param descriptors: Iterable of descriptors elements to build index over.
        :type descriptors: collections.Iterable[smqtk.data_rep.DescriptorElement]

        """
        # Materialize (ordered by UUID) and validate the input first, before
        # discarding anything belonging to the current index.
        new_descr_cache = sorted(descriptors, key=lambda e: e.uuid())
        if not new_descr_cache:
            raise ValueError("No data provided in given iterable.")

        # If there is already an index, clear the cache file if we are in the
        # same process that created our current index.
        if self._flann_index_cache and os.path.isfile(self._flann_index_cache) \
                and self._pid == multiprocessing.current_process().pid:
            self._log.debug('removing old index cache file')
            os.remove(self._flann_index_cache)

        self._log.debug("Building new index")
        self._descr_cache = new_descr_cache

        # numpy array version for FLANN
        pts_array = [d.vector() for d in self._descr_cache]
        pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)

        # Reset PID/FLANN/saved cache
        self._pid = multiprocessing.current_process().pid
        safe_create_dir(self._temp_dir)
        fd, self._flann_index_cache = tempfile.mkstemp(".flann",
                                                       dir=self._temp_dir)
        # Only the path is needed; FLANN writes the file itself below.
        os.close(fd)
        self._log.debug("Building FLANN index")
        params = {
            "algorithm": self._build_autotune,
            "target_precision": self._build_target_precision,
            "sample_fraction": self._build_sample_frac,
            "log_level": ("info"
                          if self._log.getEffectiveLevel() <= logging.DEBUG
                          else "warn")
        }
        if self._rand_seed is not None:
            params['random_seed'] = self._rand_seed
        pyflann.set_distance_type(self._distance_method)
        self._flann = pyflann.FLANN()
        self._flann_build_params = self._flann.build_index(pts_array, **params)

        # Saving out index cache
        self._log.debug("Saving index to cache file: %s",
                        self._flann_index_cache)
        self._flann.save_index(self._flann_index_cache)
예제 #24
0
파일: iqr_session.py 프로젝트: mrG7/SMQTK
 def work_dir(self):
     """
     The work directory path. ``safe_create_dir`` is invoked on it on every
     access before it is returned.
     """
     work_path = self._work_dir
     safe_create_dir(work_path)
     return self._work_dir
예제 #25
0
def ffmpeg_extract_frame_map(video_filepath, second_offset=0,
                             second_interval=0, max_duration=0, frames=(),
                             output_image_ext="png", parallel=None,
                             ffmpeg_exe='ffmpeg'):
    """
    Return a mapping of video frame index to image file in the given output
    format.

    If frames requested have not yet been extracted (based on what's contained
    in the specified output directory), they are done now. This means that this
    method could take a little time to complete if there are many frames in the
    video file and this is the first time this is being called.

    This may return an empty map if no frame indices result from the
    specified, or default, constraints.

    Extracted frames are cached in a directory structure under the globally
    configured work directory ``<smqtk_config.WORK_DIR>/VideoFrameExtraction``.
    Frames are extracted into separate directories based on the MD5 sum of the
    video file.

    A ``.lock`` file in the per-video output directory is used to serialize
    concurrent extractions of the same video; it is always removed before this
    function returns or raises.

    NOTE: The returned map contains an entry for every requested frame, even
    for frames whose extraction failed (a warning is logged for those).

    :raises RuntimeError: Frames needed extracting, but none could be
        extracted and none previously existed.

    :param video_filepath: Path to the video file to extract frames from.
    :type video_filepath: str or unicode

    :param second_offset: Seconds into the video to start extracting
    :type second_offset: float

    :param second_interval: Number of seconds between extracted frames. If
        <=0, every frame is considered.
    :type second_interval: float

    :param max_duration: Maximum number of seconds worth of extracted frames
        (starting from the specified offset). If <=0, we extract until the end
        of the video.
    :type max_duration: float

    :param frames: Specific exact frame numbers within the video to extract.
        Providing explicit frames causes offset, interval and duration
        parameters to be ignored and only the frames specified here to be
        extracted and returned.
    :type frames: collections.Iterable[int]

    :param output_image_ext: File extension (with or without a leading dot)
        of the image files to write for extracted frames.
    :type output_image_ext: str or unicode

    :param parallel: Number of processes to use for frame extraction. This is
        None by default, meaning that all available cores/threads are used.
    :type parallel: int or None

    :param ffmpeg_exe: ffmpeg executable to use for frame extraction. By
        default, we attempt to use what is available of the PATH.
    :type ffmpeg_exe: str or unicode

    :return: Map of frame-to-filepath for requested video frames
    :rtype: dict of (int, str)

    """
    log = logging.getLogger('smqtk.utils.video_utils.extract_frame_map')

    video_md = get_metadata_info(video_filepath)

    # Hash the video incrementally so that the file handle is reliably closed
    # and large videos are never loaded into memory in one piece.
    md5 = hashlib.md5()
    with open(video_filepath, 'rb') as video_file:
        for chunk in iter(lambda: video_file.read(1 << 20), b''):
            md5.update(chunk)
    video_md5sum = md5.hexdigest()

    frame_output_dir = os.path.join(smqtk_config.WORK_DIR,
                                    "VideoFrameExtraction",
                                    *string_utils.partition_string(video_md5sum,
                                                                   8))
    safe_create_dir(frame_output_dir)

    def filename_for_frame(frame, ext):
        """
        Return the standard file name for a given frame number and image
        extension (a leading dot on the extension is tolerated).
        """
        return "%08d.%s" % (frame, ext.lstrip('.'))

    def iter_frames_for_interval():
        """
        Generate the frame numbers of the input video that match the offset,
        interval and duration parameters of the enclosing call. Indices are
        0-based (i.e. first frame is 0, not 1).

        Frame positions are accumulated as floats and truncated to integers
        when yielded. The frame at the requested offset is always yielded,
        even when it lies at or beyond the cutoff.

        :return: Generator of integer frame numbers.
        :rtype: collections.Iterable[int]

        """
        _log = logging.getLogger('smqtk.utils.video_utils.extract_frame_map'
                                 '.iter_frames_for_interval')
        num_frames = int(video_md.fps * video_md.duration)
        first_frame = second_offset * video_md.fps
        _log.debug("First frame: %f", first_frame)
        if max_duration > 0:
            cutoff_frame = min(float(num_frames),
                               (max_duration + second_offset) * video_md.fps)
        else:
            cutoff_frame = float(num_frames)
        _log.debug("Cutoff frame: %f", cutoff_frame)
        # Only a strictly positive interval defines a step. A negative value
        # would otherwise produce a negative increment and a loop that never
        # reaches the cutoff.
        if second_interval > 0:
            incr = second_interval * video_md.fps
        else:
            incr = 1.0
        _log.debug("Frame increment: %f", incr)

        # Interpolate. Every yielded index is an int so that map keys are
        # uniformly typed (the offset frame position may be fractional).
        yield int(first_frame)
        next_frm = first_frame + incr
        while next_frm < cutoff_frame:
            _log.debug("-- adding frame: %f", next_frm)
            yield int(next_frm)
            next_frm += incr

    def extract_frames(frames_to_process):
        """
        Extract specific frames from the input video file using ffmpeg. If not
        all frames could be extracted, we return what we were able to extract.

        Frames are first written into a ``.TMP`` sub-directory of the output
        directory and moved to their final path once the file is confirmed to
        exist. The worker pool and the temporary directory are cleaned up
        even if a worker raises.

        :param frames_to_process: Mapping of frame-number:filepath pairs to
            extract from the input video.
        :type frames_to_process: dict[int,str or unicode]

        :return: List of frames that were successfully extracted.
        :rtype: list[int]

        """
        _log = logging.getLogger('smqtk.utils.video_utils.extract_frame_map'
                                 '.extract_frames')

        # Setup temp extraction directory
        tmp_extraction_dir = os.path.join(frame_output_dir, ".TMP")
        if os.path.isdir(tmp_extraction_dir):
            _log.debug("Existing temp director found, removing and starting "
                       "over")
            shutil.rmtree(tmp_extraction_dir, ignore_errors=True)
        os.makedirs(tmp_extraction_dir)

        extracted_frames = []
        p = multiprocessing.Pool(parallel)
        try:
            # Mapping of frame to (result, temp_output_filepath)
            #: :type: dict of (int, (AsyncResult, str))
            rmap = {}
            for f in frames_to_process:
                tfp = os.path.join(tmp_extraction_dir,
                                   filename_for_frame(f, output_image_ext))
                # Time offset (seconds) of this frame within the video.
                t = f / video_md.fps
                rmap[f] = (
                    p.apply_async(ffmpeg_extract_frame,
                                  args=(t, video_filepath, tfp, ffmpeg_exe)),
                    tfp
                )
            p.close()
            # Check for failures
            for f, ofp in frames_to_process.items():
                r, tfp = rmap[f]
                r.get()  # wait for finish (re-raises worker exceptions)
                if not os.path.isfile(tfp):
                    _log.warning("Failed to generated file for frame %d", f)
                else:
                    extracted_frames.append(f)
                    os.rename(tfp, ofp)
        finally:
            # On the success path every task has already completed, so
            # terminating is equivalent to a plain join. On the error path it
            # stops outstanding work instead of leaking worker processes.
            p.terminate()
            p.join()
            # Remove only the temp directory itself. ``os.removedirs`` would
            # additionally prune empty parent directories and fails when
            # stray files are left behind.
            shutil.rmtree(tmp_extraction_dir, ignore_errors=True)

        _log.debug("Frame extraction complete")

        return extracted_frames

    # Determine frames to extract from video
    extract_indices = set()
    if frames:
        log.debug("Only extracting specified frames: %s", frames)
        extract_indices.update(frames)
    else:
        log.debug("Determining frames needed for specification: "
                  "offset: %f, interval: %f, max_duration: %f",
                  second_offset, second_interval, max_duration)
        extract_indices.update(iter_frames_for_interval())

    if not extract_indices:
        return {}

    # frame/filename map that will be returned based on requested frames
    frame_map = dict(
        (i, os.path.join(frame_output_dir,
                         filename_for_frame(i, output_image_ext)))
        for i in extract_indices
    )

    ###
    # Acquire a file-base lock in output directory so that we don't conflict
    # with another process extracting frames to the same directory.
    #
    # NOTE: This method is prone to starvation if many processes are trying
    #       to extract frames for the same video, though this is probably
    #       not an issue for existing use cases.
    #
    lock_file = os.path.join(frame_output_dir, '.lock')
    log.debug("Acquiring file lock in '%s'...", frame_output_dir)
    while not file_utils.exclusive_touch(lock_file):
        # This is sufficiently small to be fine grained, but not so small to
        # burn the CPU.
        time.sleep(0.01)
    log.debug("Acquiring file lock -> Acquired!")

    try:
        ###
        # Determine frames to actually extract base on existing files (if any)
        #
        #: :type: dict[int, str]
        frames_to_process = {}
        existing_frames = []
        for i, img_file in sorted(frame_map.items()):
            if not os.path.isfile(img_file):
                log.debug('frame %d needs processing', i)
                frames_to_process[i] = img_file
            else:
                existing_frames.append(i)

        ###
        # Extract needed frames via hook function that provides
        # implementation.
        #
        if frames_to_process:
            frames_extracted = extract_frames(frames_to_process)

            if (len(existing_frames) + len(frames_extracted)) == 0:
                raise RuntimeError("Failed to extract any frames for video")

        return frame_map
    finally:
        # Always release the lock, including when extraction raised.
        os.remove(lock_file)