class FeatureMemory(object):
    """
    Class for encapsulating and managing feature and kernel matrices for
    different feature types.

    An instance keeps the following pieces of state consistent with each
    other:

      * an ordered vector of clip IDs (index -> clip ID),
      * a feature matrix with one row per clip ID,
      * a square kernel matrix holding the histogram intersection value
        between every pair of stored feature vectors,
      * the set of clip IDs that are treated as background clips,
      * a clip ID -> row index lookup dictionary.

    Mutation goes through ``update``, which holds the write side of the
    read/write lock for its whole duration.
    """

    @classmethod
    def construct_from_files(cls, id_vector_file, bg_flags_file,
                             feature_mat_file, kernel_mat_file, rw_lock=None):
        """
        Initialize FeatureMemory object from file sources.

        Every file is read with ``numpy.load(...)``.

        :param id_vector_file: File containing the clip ID values in the
            order in which they associate to the rows of the feature and
            kernel matrices.
        :type id_vector_file: str

        :param bg_flags_file: File containing one flag per row index. A
            truthy flag marks the clip ID at that index as a background
            clip.
        :type bg_flags_file: str

        :param feature_mat_file: File containing the feature matrix (loaded
            as an ndarray, converted to a matrix on load).
        :type feature_mat_file: str

        :param kernel_mat_file: File containing the kernel matrix (loaded as
            an ndarray, converted to a matrix on load).
        :type kernel_mat_file: str

        :param rw_lock: Optional lock passed through to the constructor.
        :type rw_lock: None or ReadWriteLock

        :return: FeatureMemory constructed with the data in the provided
            files.
        :rtype: FeatureMemory

        """
        clip_ids = np.array(np.load(id_vector_file))
        bg_flags = np.array(np.load(bg_flags_file))
        # noinspection PyCallingNonCallable
        feature_mat = np.matrix(np.load(feature_mat_file))
        # noinspection PyCallingNonCallable
        kernel_mat = np.matrix(np.load(kernel_mat_file))

        bg_clips = set(clip_ids[i] for i, f in enumerate(bg_flags) if f)

        # Build through ``cls`` so that subclasses get instances of their
        # own type from this alternate constructor.
        return cls(clip_ids, bg_clips, feature_mat, kernel_mat,
                   rw_lock=rw_lock)

    @property
    def _log(self):
        """Logger named ``<module>.<class name>`` for this instance."""
        return logging.getLogger('.'.join([self.__module__,
                                           self.__class__.__name__]))

    def __init__(self, id_vector, bg_clip_ids, feature_mat, kernel_mat,
                 rw_lock=None):
        """
        Initialize this FeatureMemory object

        This class must be used with numpy ndarray and matrix classes for
        shared memory purposes.

        NOTE: Arrays and matrices given here must own their data! This is
        currently required in order to resize them later when updating with
        new feature vectors. A ValueError will be thrown if a given
        array/matrix does not own its data.

        NOTE: ``update`` adds to / discards from ``bg_clip_ids`` in place, so
        although a frozenset passes the type check here, an instance built
        with one cannot have background flags changed later.

        TODO: Allow kernel matrix to be optional, causing it to be built from
              the provided feature matrix (not a recommended action).

        :param id_vector: (numpy) Array of clip IDs. This is used as the map
            from an index position to the clip ID it is associated with in
            the feature and kernel matrices.
        :type id_vector: ndarray of int

        :param bg_clip_ids: Set of clip IDs that are to be treated as
            background clip IDs.
        :type bg_clip_ids: set of int

        :param feature_mat: (numpy) Matrix of features for clip IDs. Features
            should be stored vertically, i.e. each row is a feature for a
            particular clip ID (id_vector being the index-to-clipID map).
        :type feature_mat: matrix of double

        :param kernel_mat: (numpy) Matrix detailing the distances between
            feature vectors. This must be a square, symmetric matrix with as
            many rows as ``feature_mat``.
        :type kernel_mat: matrix of double

        :param rw_lock: Optional ReadWriteLock for this instance to use. If
            not provided, we will create our own.
        :type rw_lock: None or ReadWriteLock

        :raise ValueError: If any array/matrix does not own its data, or if
            the kernel matrix is not square or does not match the number of
            feature rows.

        """
        # NOTE: isinstance checks on ``id_vector`` (ndarray / ArrayProxy) and
        # on ``feature_mat`` / ``kernel_mat`` (matrix / MatrixProxy) were
        # disabled in the original source; only the background ID container
        # is type-checked.
        assert isinstance(bg_clip_ids, (set, frozenset)), \
            "Background clip IDs not given as a set or frozenset!"

        # noinspection PyUnresolvedReferences
        # -> base IS a member of the matrix class...
        if id_vector.base is not None:
            raise ValueError("Given ``id_vector`` does not own its data! It "
                             "will not be transformable later.")
        elif feature_mat.base is not None:
            raise ValueError("Given ``feature_mat`` does not own its data! It "
                             "will not be transformable later.")
        elif kernel_mat.base is not None:
            raise ValueError("Given ``kernel_mat`` does not own its data! It "
                             "will not be transformable later.")

        # The kernel should be square and should be the same size as the
        # feature matrix's number of rows (unique stored clip features).
        if not (kernel_mat.shape[0] == kernel_mat.shape[1]
                == feature_mat.shape[0]):
            raise ValueError("The distance kernel matrix provided is either "
                             "misshapen or conflicts with the dimensions of "
                             "the provided feature matrix. (kernel matrix "
                             "shape: %s, num feature vectors: %d)"
                             % (kernel_mat.shape, feature_mat.shape[0]))

        self._log.debug("Lock given: %s", rw_lock)
        if rw_lock:
            assert isinstance(rw_lock, ReadWriteLock), \
                "Not given a value ReadWriteLock instance!"
            self._rw_lock = rw_lock
        else:
            self._log.debug("Falling back on bad lock given (given: %s)",
                            type(rw_lock))
            self._rw_lock = ReadWriteLock()

        self._id_vector = id_vector
        self._bg_clip_ids = bg_clip_ids
        self._feature_mat = feature_mat
        self._kernel_mat = kernel_mat

        # Helper structure mapping clipIDs to their row index
        self._cid2idx_map = dict((cid, idx) for idx, cid
                                 in enumerate(self._id_vector))

    @staticmethod
    def _histogram_intersection_distance(a, b):
        """
        Calculates distance between two vectors using histogram intersection.

        Non-branching version of the histogram intersection algorithm:
        ``(a + b - |a - b|) / 2`` is the element-wise minimum of ``a`` and
        ``b``, which is then summed.

        :param a: A vector in array form.
        :type a: ndarray

        :param b: A vector in array form.
        :type b: ndarray

        :return: Histogram Intersection (HI) distance scalar
        :rtype: double

        """
        # noinspection PyUnresolvedReferences
        return (a + b - np.abs(a - b)).sum() * 0.5

    def _distances_to_stored(self, feature_vec):
        """
        Histogram intersection of ``feature_vec`` against every stored row.

        The caller is expected to hold the appropriate lock.

        :param feature_vec: Feature vector to compare against each row of
            the feature matrix.
        :type feature_vec: ndarray

        :return: 1xN matrix, N being the current number of feature rows.
        :rtype: matrix

        """
        # ``np.matrix`` is used instead of the ``np.mat`` alias, which no
        # longer exists in NumPy 2.x.
        # noinspection PyCallingNonCallable
        return np.matrix(tuple(
            self._histogram_intersection_distance(feature_vec, fv)
            for fv in self._feature_mat
        ))

    def get_ids(self):
        """
        NOTE: NOT THREAD SAFE. Use the returned structure only in conjunction
        with this object's lock when in a parallel environment to prevent
        possible memory corruption.

        :return: Ordered vector of clip IDs along the row-edge of this
            object's feature matrix and along both edges of the kernel
            matrix. This is the internal array, not a copy.
        :rtype: numpy.core.multiarray.ndarray

        """
        return self._id_vector

    def get_bg_ids(self):
        """
        NOTE: The snapshot is taken without acquiring this object's lock.

        :return: Immutable copy of the set of clip IDs that we are treating
            as background clips.
        :rtype: frozenset of int

        """
        return frozenset(self._bg_clip_ids)

    def get_feature_matrix(self):
        """
        NOTE: NOT THREAD SAFE. Use the returned structure only in conjunction
        with this object's lock when in a parallel environment to prevent
        possible memory corruption.

        :return: Matrix recording feature vectors for a feature type. See the
            id vector for row-wise index-to-clipID association. This is the
            internal matrix, not a copy.
        :rtype: numpy.matrixlib.defmatrix.matrix

        """
        return self._feature_mat

    def get_kernel_matrix(self):
        """
        NOTE: NOT THREAD SAFE. Use the returned structure only in conjunction
        with this object's lock when in a parallel environment to prevent
        possible memory corruption.

        :return: Symmetric matrix detailing the distances between any two
            clip ID features. Distances are computed via histogram
            intersection. This is the internal matrix, not a copy.
        :rtype: matrix

        """
        return self._kernel_mat

    def get_lock(self):
        """
        :return: a reference to this object's read/write lock.
        :rtype: ReadWriteLock

        """
        return self._rw_lock

    def get_distance_kernel(self):
        """
        DistanceKernel object constructed from this feature's current state.

        The kernel object is handed this object's own ID vector, kernel
        matrix, background ID set and lock (no copies are made here).

        :return: This feature distance kernel.
        :rtype: DistanceKernel

        """
        with self._rw_lock.read_lock():
            return DistanceKernel(self._id_vector, self._id_vector,
                                  self._kernel_mat, self._bg_clip_ids,
                                  self._rw_lock)

    def get_feature(self, *clip_id_or_ids):
        """
        Return a matrix where each row is the feature vector for one of the
        given clip IDs.

        The given sequence of clip IDs acts as the index-to-clipID map for
        the returned matrix's rows. If repeat clip IDs are provided in the
        input, there will be repeat feature vectors in the returned matrix.

        :raise KeyError: If a given clip ID is not represented in the current
            feature matrix.

        :param clip_id_or_ids: One or more integer clip IDs to retrieve the
            feature vectors for. Python ints and numpy integer scalars (as
            found in the vector returned by ``get_ids``) are accepted.
        :type clip_id_or_ids: tuple of int

        :return: NxM matrix, where N is the number of clip IDs requested and
            M is the length of a feature vector for this feature.
        :rtype: np.matrix

        """
        assert all(isinstance(e, (int, np.integer))
                   for e in clip_id_or_ids), \
            "Not given an integer or a valid iterable over integers!"

        with self._rw_lock.read_lock():
            # rows = num of IDs given, cols = width of feature matrix
            with SimpleTimer("Allocating return matrix", self._log.debug):
                # noinspection PyUnresolvedReferences
                # -> matrix class DOES have ``dtype`` property...
                ret_mat = matrix(ndarray((len(clip_id_or_ids),
                                          self._feature_mat.shape[1]),
                                         self._feature_mat.dtype))
            for i, cid in enumerate(clip_id_or_ids):
                feature_idx = self._cid2idx_map[cid]
                ret_mat[i, :] = self._feature_mat[feature_idx, :]
            return ret_mat

    # noinspection PyUnresolvedReferences,PyCallingNonCallable
    def update(self, clip_id, feature_vec=None, is_background=False,
               timeout=None):
        """
        Update this feature with a feature vector associated with a clip ID.

        If the clip ID is already in the feature matrix, its background flag
        is always refreshed and, when a vector is given, the stored vector is
        replaced and the matching kernel row and column are recomputed. If
        the clip ID is new, the ID vector, feature matrix and kernel matrix
        each grow by one entry/row/row-and-column.

        :raise ValueError: If the given feature vector is not one-dimensional
            or does not match the width of the feature matrix, or if a new
            clip ID is given without a feature vector.
        :raise RuntimeError: If the feature or kernel matrix no longer owns
            its data when a new clip has to be appended. The original
            documentation also states this is raised when a timeout is given
            and the write lock is not acquired in time (lock implementation
            not visible here).

        :param clip_id: The ID of the clip the given ``feature_vec``
            represents. Coerced with ``int(...)``.
        :type clip_id: int

        :param feature_vec: Feature vector associated to the given clip ID.
        :type feature_vec: ndarray

        :param is_background: Flag declaring that this clip ID represents a
            background feature.
        :type is_background: bool

        :param timeout: Timeout seconds for the underlying write lock to
            acquire before a RuntimeError is thrown.
        :type timeout: None or int or float

        """
        with self._rw_lock.write_lock(timeout):
            clip_id = int(clip_id)
            if feature_vec is not None and \
                    not (feature_vec.ndim == 1
                         and len(feature_vec) == self._feature_mat.shape[1]):
                raise ValueError("Given feature vector not compatible "
                                 "(dimensionality or length does not match)")

            # Update the given feature vector and kernel distances
            if clip_id in self._cid2idx_map:
                # In all cases, update the background status of the clip
                if is_background:
                    self._bg_clip_ids.add(clip_id)
                else:
                    self._bg_clip_ids.discard(clip_id)

                # If we were given a new feature vector, update entries
                if feature_vec is not None:
                    idx = self._cid2idx_map[clip_id]
                    self._feature_mat[idx] = feature_vec
                    new_dist = self._distances_to_stored(feature_vec)
                    self._kernel_mat[idx, :] = new_dist
                    # The column slot of a matrix is Nx1, so the 1xN row of
                    # distances has to be transposed before assignment.
                    self._kernel_mat[:, idx] = new_dist.T

            # Given a new feature to add.
            else:
                if feature_vec is None:
                    raise ValueError("Update given a new clip ID, but no "
                                     "feature vector provided.")

                # Validate everything that can fail BEFORE mutating any
                # state, so a failure cannot leave the ID map / ID vector
                # out of step with the matrices.
                if self._feature_mat.base is not None:
                    raise RuntimeError("Feature matrix does not own its data")
                if self._kernel_mat.base is not None:
                    raise RuntimeError("kernel matrix does not own its data")
                assert self._kernel_mat.shape[0] == self._kernel_mat.shape[1], \
                    "kernel matrix is not symmetric for some reason???"

                # Update internal ID structures with the added clip
                self._cid2idx_map[clip_id] = self._id_vector.size
                self._id_vector.resize((self._id_vector.size + 1,),
                                       refcheck=False)
                self._id_vector[-1] = clip_id
                if is_background:
                    self._bg_clip_ids.add(clip_id)

                # Since we're only adding a new row, this resize does not
                # affect the positioning of the existing data.
                self._feature_mat.resize((self._feature_mat.shape[0] + 1,
                                          self._feature_mat.shape[1]),
                                         refcheck=False)
                self._feature_mat[-1, :] = feature_vec

                # Need to add a new row AND column to the distance kernel.
                # An in-place resize changes the row stride, so the old
                # values are copied out first and written back into the
                # top-left corner afterwards.
                # noinspection PyPep8Naming
                # -> because I like ``N`` better...
                N = self._kernel_mat.shape[0]
                kernel_copy = np.matrix(self._kernel_mat)
                self._kernel_mat.resize((N + 1, N + 1), refcheck=False)
                self._kernel_mat[:N, :N] = kernel_copy
                del kernel_copy

                # Computing new feature distance (histogram intersection).
                # Only need to compute this once because of HI being
                # commutative and the kernel matrix being symmetric.
                dist_vec = self._distances_to_stored(feature_vec)
                self._kernel_mat[-1, :] = dist_vec
                self._kernel_mat[:, -1] = dist_vec.T
class FeatureMemory(object):
    """
    Class for encapsulating and managing feature and kernel matrices for
    different feature types.

    An instance keeps the following pieces of state consistent with each
    other:

      * an ordered vector of clip IDs (index -> clip ID),
      * a feature matrix with one row per clip ID,
      * a square kernel matrix holding the histogram intersection value
        between every pair of stored feature vectors,
      * the set of clip IDs that are treated as background clips,
      * a clip ID -> row index lookup dictionary.

    Mutation goes through ``update``, which holds the write side of the
    read/write lock for its whole duration.
    """

    @classmethod
    def construct_from_files(cls, id_vector_file, bg_flags_file,
                             feature_mat_file, kernel_mat_file, rw_lock=None):
        """
        Initialize FeatureMemory object from file sources.

        Every file is read with ``numpy.load(...)``.

        :param id_vector_file: File containing the clip ID values in the
            order in which they associate to the rows of the feature and
            kernel matrices.
        :type id_vector_file: str

        :param bg_flags_file: File containing one flag per row index. A
            truthy flag marks the clip ID at that index as a background
            clip.
        :type bg_flags_file: str

        :param feature_mat_file: File containing the feature matrix (loaded
            as an ndarray, converted to a matrix on load).
        :type feature_mat_file: str

        :param kernel_mat_file: File containing the kernel matrix (loaded as
            an ndarray, converted to a matrix on load).
        :type kernel_mat_file: str

        :param rw_lock: Optional lock passed through to the constructor.
        :type rw_lock: None or ReadWriteLock

        :return: FeatureMemory constructed with the data in the provided
            files.
        :rtype: FeatureMemory

        """
        clip_ids = np.array(np.load(id_vector_file))
        bg_flags = np.array(np.load(bg_flags_file))
        # noinspection PyCallingNonCallable
        feature_mat = np.matrix(np.load(feature_mat_file))
        # noinspection PyCallingNonCallable
        kernel_mat = np.matrix(np.load(kernel_mat_file))

        bg_clips = set(clip_ids[i] for i, f in enumerate(bg_flags) if f)

        # Build through ``cls`` so that subclasses get instances of their
        # own type from this alternate constructor.
        return cls(clip_ids, bg_clips, feature_mat, kernel_mat,
                   rw_lock=rw_lock)

    @property
    def _log(self):
        """Logger named ``<module>.<class name>`` for this instance."""
        return logging.getLogger('.'.join([self.__module__,
                                           self.__class__.__name__]))

    def __init__(self, id_vector, bg_clip_ids, feature_mat, kernel_mat,
                 rw_lock=None):
        """
        Initialize this FeatureMemory object

        This class must be used with numpy ndarray and matrix classes for
        shared memory purposes.

        NOTE: Arrays and matrices given here must own their data! This is
        currently required in order to resize them later when updating with
        new feature vectors. A ValueError will be thrown if a given
        array/matrix does not own its data.

        NOTE: ``update`` adds to / discards from ``bg_clip_ids`` in place, so
        although a frozenset passes the type check here, an instance built
        with one cannot have background flags changed later.

        TODO: Allow kernel matrix to be optional, causing it to be built from
              the provided feature matrix (not a recommended action).

        :param id_vector: (numpy) Array of clip IDs. This is used as the map
            from an index position to the clip ID it is associated with in
            the feature and kernel matrices.
        :type id_vector: ndarray of int

        :param bg_clip_ids: Set of clip IDs that are to be treated as
            background clip IDs.
        :type bg_clip_ids: set of int

        :param feature_mat: (numpy) Matrix of features for clip IDs. Features
            should be stored vertically, i.e. each row is a feature for a
            particular clip ID (id_vector being the index-to-clipID map).
        :type feature_mat: matrix of double

        :param kernel_mat: (numpy) Matrix detailing the distances between
            feature vectors. This must be a square, symmetric matrix with as
            many rows as ``feature_mat``.
        :type kernel_mat: matrix of double

        :param rw_lock: Optional ReadWriteLock for this instance to use. If
            not provided, we will create our own.
        :type rw_lock: None or ReadWriteLock

        :raise ValueError: If any array/matrix does not own its data, or if
            the kernel matrix is not square or does not match the number of
            feature rows.

        """
        # NOTE: isinstance checks on ``id_vector`` (ndarray / ArrayProxy) and
        # on ``feature_mat`` / ``kernel_mat`` (matrix / MatrixProxy) were
        # disabled in the original source; only the background ID container
        # is type-checked.
        assert isinstance(bg_clip_ids, (set, frozenset)), \
            "Background clip IDs not given as a set or frozenset!"

        # noinspection PyUnresolvedReferences
        # -> base IS a member of the matrix class...
        if id_vector.base is not None:
            raise ValueError("Given ``id_vector`` does not own its data! It "
                             "will not be transformable later.")
        elif feature_mat.base is not None:
            raise ValueError("Given ``feature_mat`` does not own its data! It "
                             "will not be transformable later.")
        elif kernel_mat.base is not None:
            raise ValueError("Given ``kernel_mat`` does not own its data! It "
                             "will not be transformable later.")

        # The kernel should be square and should be the same size as the
        # feature matrix's number of rows (unique stored clip features).
        if not (kernel_mat.shape[0] == kernel_mat.shape[1]
                == feature_mat.shape[0]):
            raise ValueError("The distance kernel matrix provided is either "
                             "misshapen or conflicts with the dimensions of "
                             "the provided feature matrix. (kernel matrix "
                             "shape: %s, num feature vectors: %d)"
                             % (kernel_mat.shape, feature_mat.shape[0]))

        self._log.debug("Lock given: %s", rw_lock)
        if rw_lock:
            assert isinstance(rw_lock, ReadWriteLock), \
                "Not given a value ReadWriteLock instance!"
            self._rw_lock = rw_lock
        else:
            self._log.debug("Falling back on bad lock given (given: %s)",
                            type(rw_lock))
            self._rw_lock = ReadWriteLock()

        self._id_vector = id_vector
        self._bg_clip_ids = bg_clip_ids
        self._feature_mat = feature_mat
        self._kernel_mat = kernel_mat

        # Helper structure mapping clipIDs to their row index
        self._cid2idx_map = dict((cid, idx) for idx, cid
                                 in enumerate(self._id_vector))

    @staticmethod
    def _histogram_intersection_distance(a, b):
        """
        Calculates distance between two vectors using histogram intersection.

        Non-branching version of the histogram intersection algorithm:
        ``(a + b - |a - b|) / 2`` is the element-wise minimum of ``a`` and
        ``b``, which is then summed.

        :param a: A vector in array form.
        :type a: ndarray

        :param b: A vector in array form.
        :type b: ndarray

        :return: Histogram Intersection (HI) distance scalar
        :rtype: double

        """
        # noinspection PyUnresolvedReferences
        return (a + b - np.abs(a - b)).sum() * 0.5

    def _distances_to_stored(self, feature_vec):
        """
        Histogram intersection of ``feature_vec`` against every stored row.

        The caller is expected to hold the appropriate lock.

        :param feature_vec: Feature vector to compare against each row of
            the feature matrix.
        :type feature_vec: ndarray

        :return: 1xN matrix, N being the current number of feature rows.
        :rtype: matrix

        """
        # ``np.matrix`` is used instead of the ``np.mat`` alias, which no
        # longer exists in NumPy 2.x.
        # noinspection PyCallingNonCallable
        return np.matrix(tuple(
            self._histogram_intersection_distance(feature_vec, fv)
            for fv in self._feature_mat
        ))

    def get_ids(self):
        """
        NOTE: NOT THREAD SAFE. Use the returned structure only in conjunction
        with this object's lock when in a parallel environment to prevent
        possible memory corruption.

        :return: Ordered vector of clip IDs along the row-edge of this
            object's feature matrix and along both edges of the kernel
            matrix. This is the internal array, not a copy.
        :rtype: numpy.core.multiarray.ndarray

        """
        return self._id_vector

    def get_bg_ids(self):
        """
        NOTE: The snapshot is taken without acquiring this object's lock.

        :return: Immutable copy of the set of clip IDs that we are treating
            as background clips.
        :rtype: frozenset of int

        """
        return frozenset(self._bg_clip_ids)

    def get_feature_matrix(self):
        """
        NOTE: NOT THREAD SAFE. Use the returned structure only in conjunction
        with this object's lock when in a parallel environment to prevent
        possible memory corruption.

        :return: Matrix recording feature vectors for a feature type. See the
            id vector for row-wise index-to-clipID association. This is the
            internal matrix, not a copy.
        :rtype: numpy.matrixlib.defmatrix.matrix

        """
        return self._feature_mat

    def get_kernel_matrix(self):
        """
        NOTE: NOT THREAD SAFE. Use the returned structure only in conjunction
        with this object's lock when in a parallel environment to prevent
        possible memory corruption.

        :return: Symmetric matrix detailing the distances between any two
            clip ID features. Distances are computed via histogram
            intersection. This is the internal matrix, not a copy.
        :rtype: matrix

        """
        return self._kernel_mat

    def get_lock(self):
        """
        :return: a reference to this object's read/write lock.
        :rtype: ReadWriteLock

        """
        return self._rw_lock

    def get_distance_kernel(self):
        """
        DistanceKernel object constructed from this feature's current state.

        The kernel object is handed this object's own ID vector, kernel
        matrix, background ID set and lock (no copies are made here).

        :return: This feature distance kernel.
        :rtype: DistanceKernel

        """
        with self._rw_lock.read_lock():
            return DistanceKernel(self._id_vector, self._id_vector,
                                  self._kernel_mat, self._bg_clip_ids,
                                  self._rw_lock)

    def get_feature(self, *clip_id_or_ids):
        """
        Return a matrix where each row is the feature vector for one of the
        given clip IDs.

        The given sequence of clip IDs acts as the index-to-clipID map for
        the returned matrix's rows. If repeat clip IDs are provided in the
        input, there will be repeat feature vectors in the returned matrix.

        :raise KeyError: If a given clip ID is not represented in the current
            feature matrix.

        :param clip_id_or_ids: One or more integer clip IDs to retrieve the
            feature vectors for. Python ints and numpy integer scalars (as
            found in the vector returned by ``get_ids``) are accepted.
        :type clip_id_or_ids: tuple of int

        :return: NxM matrix, where N is the number of clip IDs requested and
            M is the length of a feature vector for this feature.
        :rtype: np.matrix

        """
        assert all(isinstance(e, (int, np.integer))
                   for e in clip_id_or_ids), \
            "Not given an integer or a valid iterable over integers!"

        with self._rw_lock.read_lock():
            # rows = num of IDs given, cols = width of feature matrix
            with SimpleTimer("Allocating return matrix", self._log.debug):
                # noinspection PyUnresolvedReferences
                # -> matrix class DOES have ``dtype`` property...
                ret_mat = matrix(ndarray((len(clip_id_or_ids),
                                          self._feature_mat.shape[1]),
                                         self._feature_mat.dtype))
            for i, cid in enumerate(clip_id_or_ids):
                feature_idx = self._cid2idx_map[cid]
                ret_mat[i, :] = self._feature_mat[feature_idx, :]
            return ret_mat

    # noinspection PyUnresolvedReferences,PyCallingNonCallable
    def update(self, clip_id, feature_vec=None, is_background=False,
               timeout=None):
        """
        Update this feature with a feature vector associated with a clip ID.

        If the clip ID is already in the feature matrix, its background flag
        is always refreshed and, when a vector is given, the stored vector is
        replaced and the matching kernel row and column are recomputed. If
        the clip ID is new, the ID vector, feature matrix and kernel matrix
        each grow by one entry/row/row-and-column.

        :raise ValueError: If the given feature vector is not one-dimensional
            or does not match the width of the feature matrix, or if a new
            clip ID is given without a feature vector.
        :raise RuntimeError: If the feature or kernel matrix no longer owns
            its data when a new clip has to be appended. The original
            documentation also states this is raised when a timeout is given
            and the write lock is not acquired in time (lock implementation
            not visible here).

        :param clip_id: The ID of the clip the given ``feature_vec``
            represents. Coerced with ``int(...)``.
        :type clip_id: int

        :param feature_vec: Feature vector associated to the given clip ID.
        :type feature_vec: ndarray

        :param is_background: Flag declaring that this clip ID represents a
            background feature.
        :type is_background: bool

        :param timeout: Timeout seconds for the underlying write lock to
            acquire before a RuntimeError is thrown.
        :type timeout: None or int or float

        """
        with self._rw_lock.write_lock(timeout):
            clip_id = int(clip_id)
            if feature_vec is not None and \
                    not (feature_vec.ndim == 1
                         and len(feature_vec) == self._feature_mat.shape[1]):
                raise ValueError("Given feature vector not compatible "
                                 "(dimensionality or length does not match)")

            # Update the given feature vector and kernel distances
            if clip_id in self._cid2idx_map:
                # In all cases, update the background status of the clip
                if is_background:
                    self._bg_clip_ids.add(clip_id)
                else:
                    self._bg_clip_ids.discard(clip_id)

                # If we were given a new feature vector, update entries
                if feature_vec is not None:
                    idx = self._cid2idx_map[clip_id]
                    self._feature_mat[idx] = feature_vec
                    new_dist = self._distances_to_stored(feature_vec)
                    self._kernel_mat[idx, :] = new_dist
                    # The column slot of a matrix is Nx1, so the 1xN row of
                    # distances has to be transposed before assignment.
                    self._kernel_mat[:, idx] = new_dist.T

            # Given a new feature to add.
            else:
                if feature_vec is None:
                    raise ValueError("Update given a new clip ID, but no "
                                     "feature vector provided.")

                # Validate everything that can fail BEFORE mutating any
                # state, so a failure cannot leave the ID map / ID vector
                # out of step with the matrices.
                if self._feature_mat.base is not None:
                    raise RuntimeError("Feature matrix does not own its data")
                if self._kernel_mat.base is not None:
                    raise RuntimeError("kernel matrix does not own its data")
                assert self._kernel_mat.shape[0] == self._kernel_mat.shape[1], \
                    "kernel matrix is not symmetric for some reason???"

                # Update internal ID structures with the added clip
                self._cid2idx_map[clip_id] = self._id_vector.size
                self._id_vector.resize((self._id_vector.size + 1,),
                                       refcheck=False)
                self._id_vector[-1] = clip_id
                if is_background:
                    self._bg_clip_ids.add(clip_id)

                # Since we're only adding a new row, this resize does not
                # affect the positioning of the existing data.
                self._feature_mat.resize((self._feature_mat.shape[0] + 1,
                                          self._feature_mat.shape[1]),
                                         refcheck=False)
                self._feature_mat[-1, :] = feature_vec

                # Need to add a new row AND column to the distance kernel.
                # An in-place resize changes the row stride, so the old
                # values are copied out first and written back into the
                # top-left corner afterwards.
                # noinspection PyPep8Naming
                # -> because I like ``N`` better...
                N = self._kernel_mat.shape[0]
                kernel_copy = np.matrix(self._kernel_mat)
                self._kernel_mat.resize((N + 1, N + 1), refcheck=False)
                self._kernel_mat[:N, :N] = kernel_copy
                del kernel_copy

                # Computing new feature distance (histogram intersection).
                # Only need to compute this once because of HI being
                # commutative and the kernel matrix being symmetric.
                dist_vec = self._distances_to_stored(feature_vec)
                self._kernel_mat[-1, :] = dist_vec
                self._kernel_mat[:, -1] = dist_vec.T
class DistanceKernel(object): """ Feature Distance Kernel object. This class allows the kernel to either be symmetric or not. If it is symmetric, the ``symmetric_submatrix`` function becomes available. Intended to be used with ProxyManager proxy objects (given at construction) MONKEY PATCHING: When using this object directly (not using the ProxyManager stuff) and sending it over pipes, the ReadWriteLock needs to be monkey patched out (the multiprocessing.Condition variable doesn't play nicely). Need to set an instance of a DummyRWLock to the DistanceKernel._rw_lock property. For example: ... dk = ... dk._rw_lock = DummyRWLock() <send dk into a pipe> ... """ @classmethod def construct_symmetric_from_files(cls, id_vector_file, kernel_mat_file, bg_flags_file=None): """ Construct a symmetric DistanceKernel object, requiring a background flags file to denote clip IDs that are to be treated as background clips (required to activate symmetric_submatrix function). Such a DistanceKernel is usually used with event learning and should be provided a background flags file also. :param id_vector_file: File containing the numpy.savetxt(...) output of clip ID values in the order in which they associate to the rows of the kernel matrix. :type id_vector_file: str :param kernel_mat_file: File containing the kernel matrix as saved by numpy.save(...) (saved as an ndarray, converted to matrix on load). :type kernel_mat_file: str :param bg_flags_file: Optional file containing output of numpy.savetxt(...) where each index maps a row index of the kernel to whether or not the associated clip ID should be considered a background video or not. :type bg_flags_file: str :return: Symmetric DistanceKernel constructed with the data provided in the provided files. 
:rtype: DistanceKernel """ clip_ids = np.array(np.loadtxt(id_vector_file)) # noinspection PyCallingNonCallable kernel_mat = np.matrix(np.load(kernel_mat_file)) if bg_flags_file is not None: bg_flags = np.array(np.loadtxt(bg_flags_file)) bg_clips = np.array( [clip_ids[i] for i, e in enumerate(bg_flags) if e]) else: bg_clips = None return DistanceKernel(clip_ids, clip_ids, kernel_mat, bg_clips) @classmethod def construct_asymmetric_from_files(cls, row_ids_file, col_ids_file, kernel_mat_file): """ Construct an asymmetric DistanceKernel object, usually used for archive searches. No option for providing background clip IDs as asymmetric kernels are NOT used for learning purposes. :param row_ids_file: File containing the numpy.savetxt(...) output of clip ID values in the order in which they associate to the rows of the given kernel matrix. :type row_ids_file: str :param col_ids_file: File containing the numpy.savetxt(...) output of clip ID values in the order in which they associate to the columns of the given kernel matrix. :type col_ids_file: str :param kernel_mat_file: File containing the kernel matrix as saved by numpy.save(...) (saved as an ndarray, converted to matrix on load). :type kernel_mat_file: str :return: Asymmetric DistanceKernel constructed with the data provided in the provided files. :rtype: DistanceKernel """ row_cids = np.array(np.loadtxt(row_ids_file)) col_cids = np.array(np.loadtxt(col_ids_file)) # noinspection PyCallingNonCallable kernel_mat = np.matrix(np.load(kernel_mat_file)) return DistanceKernel(row_cids, col_cids, kernel_mat) @property def _log(self): return logging.getLogger('.'.join( [self.__module__, self.__class__.__name__])) def __init__(self, row_id_index_map, col_id_index_map, kernel_mat, bg_clip_ids=None, rw_lock=None): """ Initialize the kernel matrix. The initialization values will more than likely be proxies to np.matrix objects. 
The ``bg_clip_ids`` array may be given when this kernel matrix is to be a square, symmetric kernel and activates the use of the ``symmetric_submatrix`` method. This array must list clip IDs that are to be considered "background" IDs, or clips that are to always be considered negative. These clip IDs must be included in symmetric sub-matrices. This array must be the same dimension as the row and column indices, containing boolean flags. These flags mark that the clip ID found at the same index in the edge ID maps should be considered a "background" ID, or one that is always treated as a negative. This is for the stipulation in the symmetric_submatrix method that it always includes the background ID set in the submatrix. :param row_id_index_map: Array of clip IDs associated to row indices. Contents will be treated as ints. :type row_id_index_map: ndarray of int :param col_id_index_map: Array of clip IDs associated to row indices. Contents will be treated as ints. :type col_id_index_map: ndarray of int :param kernel_mat: Kernel data matrix. :type kernel_mat: matrix :param bg_clip_ids: Optional array of boolean flags, marking whether an index should be considered a "background" video. Contents will be treated as ints. :type bg_clip_ids: set of int :param rw_lock: Read-Write lock for data provided. This should be provided if the any of the data is shared with other objects/ sources. If this is given None (default), then a lock is created. :type rw_lock: ReadWriteLock or None """ # TODO: Possibly add checks for the id arrays like there is for the # bgclipid array (int-able contents) assert row_id_index_map.shape[0] == kernel_mat.shape[0], \ "Length of row index map and kernel row count did not match! " \ "(row index map: %d, kernel row count: %d)" \ % (row_id_index_map.shape[0], kernel_mat.shape[0]) assert col_id_index_map.shape[0] == kernel_mat.shape[1], \ "Length of col index map and kernel col count did not match! 
" \ "(col index map: %d, kernel col count: %d)" \ % (col_id_index_map.shape[0], kernel_mat.shape[1]) self._row_id_index_map = row_id_index_map self._col_id_index_map = col_id_index_map self._kernel = kernel_mat assert ((bg_clip_ids is None) or isinstance(bg_clip_ids, (set, frozenset))), \ "Must either given None or a set for the bg_clip_ids " \ "vector. Got: %s" % type(bg_clip_ids) self._bg_cid_set = bg_clip_ids if bg_clip_ids is not None: try: [int(e) for e in bg_clip_ids] except Exception: raise ValueError("Not all of the contents of of bg_clip_ids " "could be treated as ints!") if rw_lock: assert isinstance(rw_lock, ReadWriteLock), \ "Did not receive valid istance of RW Lock. Got '%s'" \ % type(rw_lock) self._rw_lock = rw_lock else: self._rw_lock = ReadWriteLock() def get_lock(self): """ :return: This object's read/write lock. :rtype: ReadWriteLock """ return self._rw_lock def row_id_map(self): """ :return: Row index-to-clipID map :rtype: ndarray """ with self.get_lock().read_lock(): return self._row_id_index_map def col_id_map(self): """ :return: Column index-to-clipID map :rtype: ndarray """ with self.get_lock().read_lock(): return self._col_id_index_map def get_kernel_matrix(self): """ RETURNED OBJECTS NOT THREAD/PROCESS SAFE. Once retrieved, if matrix may be modified by another thread/process :return: The underlying kernel matrix. :rtype: matrix """ with self.get_lock().read_lock(): return self._kernel def get_background_ids(self): """ RETURNED OBJECTS NOT THREAD/PROCESS SAFE :return: The set of background clip IDs. May be None if there was no background set initialized. :rtype: None or frozenset """ with self.get_lock().read_lock(): return frozenset(self._bg_cid_set) \ if self._bg_cid_set is not None \ else frozenset() def is_symmetric(self): """ :return: True if this is a square kernel matrix. This means that clip IDs along the row and column axes are the same and in the same order (starting from [0,0] and moving outwards). 
:rtype: bool """ with self._rw_lock.read_lock(): # Doing shape equality short circuit because the return value of # numpy.array equality changes depending on this condition, meaning # the use of the ...all() member function on the result is not # universally possible (i.e. when it returns a bool value when # shapes are not equal). # noinspection PyUnresolvedReferences return (self._row_id_index_map.shape == self._col_id_index_map.shape and (self._row_id_index_map == self._col_id_index_map).all()) def symmetric_submatrix(self, *clip_ids): """ Return a symmetric sub NxN matrix of the total distance kernel based on the clip IDs provided. The background clips will always be included in the matrix if this DistanceKernel was constructed with a list of background clip IDs. Clip IDs provided will be assumed non-background, or positive event examples. If the clip ID of a background video is provided as an argument, we will reconsider it as a non-background video in the returned index-to-is-background mapping (tuple). Note: The matrix returned will always be a new instance and not set up to use shared memory. When directly used with shared memory objects, it will be passed by value, not by reference. :param clip_ids: Integer clip IDs to include in the returned matrix. The returned matrix will contain all background clip IDs. :type clip_ids: Iterable of int :return: The index-to-clipID map (tuple), the index-to-is-background map (tuple) and the symmetric NxN submatrix, where N is the number of clip IDs provided as arguments plus the number of background IDs, minus the overlap between those two sets. :rtype: tuple of int, tuple of bool, numpy.matrixlib.defmatrix.matrix """ with self._rw_lock.read_lock(): with SimpleTimer("Checking inputs", self._log.debug): if not self.is_symmetric(): raise RuntimeError("Cannot get a symmetric sub-matrix if " "the kernel is not square!") # DEPRECATED: Allowing the use of this method without explicitly # providing background cIDs. 
This object will # probably not ever be used this way, but there's no # reason to explicitly disallow it. # if self._bg_cid_vec is None: # raise RuntimeError("Cannot create the square submatrix " # "without the background flag vector!") try: clip_ids = [int(e) for e in clip_ids] except: raise ValueError("Not all clip IDs could be used as ints!") id_diff = set(clip_ids).difference(self._row_id_index_map) assert not id_diff, \ "Not all clip IDs provided are represented in this " \ "distance kernel matrix! (difference: %s)" \ % id_diff del id_diff with SimpleTimer("Computing union of BG clips and provided IDs", self._log.debug): if self._bg_cid_set is not None: all_cids = self._bg_cid_set.union(clip_ids) else: all_cids = set(clip_ids) # Reorder the given clip IDs so that they are in the same relative # order as the kernel matrix edges. focus_indices = [] focus_clipids = [] for idx, cid in enumerate(self._row_id_index_map): if (cid in all_cids) and (cid not in focus_clipids): focus_indices.append(idx) focus_clipids.append(cid) # index-to-isBG map for return # -> IDs provided as arguments are to be considered non-background, # even if a the ID is in the background set. All other IDs in the # union then must be from the background set. focus_id2isbg = [] for idx in focus_indices: cid = self._row_id_index_map[idx] focus_id2isbg.append(False if cid in clip_ids else True) ret_mat = self._kernel[focus_indices, :][:, focus_indices] return focus_clipids, focus_id2isbg, ret_mat # noinspection PyPep8Naming def extract_rows(self, *clipID_or_IDs): """ Find and return the v-stacked distance vectors, in kernel row order (i.e. not in the order given as arguments), of the kernel rows matching the given clip IDs. Note: The matrix returned will always be a new instance and not set up to use shared memory. When directly used with shared memory objects, it will be passed by value, not by reference. :param clipID_or_IDs: The integer clip ID or IDs of which to get the distance vectors for. 
:type clipID_or_IDs: int or Iterable of int :return: The row-wise index-to-clipID map (tuple), the column-wise index-to-clipID map (tuple), and the KxL shape matrix, where K is the number of clip IDs given to the method, and L is the width (columns) of the distance kernel. :rtype: tuple of int, tuple of int, matrix """ with self._rw_lock.read_lock(): with SimpleTimer("Checking inputs", self._log.debug): try: clipID_or_IDs = frozenset(int(e) for e in clipID_or_IDs) except Exception, ex: raise ValueError("Not all clip IDs could be used as ints: " "%s" % str(ex)) id_diff = clipID_or_IDs.difference(self._row_id_index_map) assert not id_diff, \ "Not all clip IDs provided are represented in this " \ "distance kernel matrix! (difference: %s)" \ % id_diff del id_diff # Reorder the given clip IDs so that they are in the same relative # order as the kernel matrix edge order with SimpleTimer("Creating focus index/cid sequence", self._log.debug): focus_row_indices = [] focus_row_clipids = [] for idx, cid in enumerate(self._row_id_index_map): # if ((cid in clipID_or_IDs) # and (cid not in focus_row_clipids)): if cid in clipID_or_IDs: focus_row_indices.append(idx) focus_row_clipids.append(cid) with SimpleTimer("Cropping kernel to focus range", self._log.debug): return (tuple(focus_row_clipids), tuple(self._col_id_index_map), self._kernel[focus_row_indices, :])
class DistanceKernel (object):
    """
    Feature distance kernel: a kernel matrix plus the clip ID maps for its
    row and column edges, guarded by a read/write lock.

    The kernel may or may not be symmetric. When the row and column ID maps
    are identical, ``symmetric_submatrix`` becomes usable.

    Intended to be used with ProxyManager proxy objects (given at
    construction).

    MONKEY PATCHING:
    When using this object directly (not using the ProxyManager stuff) and
    sending it over pipes, the ReadWriteLock needs to be monkey patched out
    (the multiprocessing.Condition variable doesn't play nicely). Need to set
    an instance of a DummyRWLock to the ``DistanceKernel._rw_lock`` property.

    For example:
        ...
        dk = ...
        dk._rw_lock = DummyRWLock()
        <send dk into a pipe>
        ...

    """

    @staticmethod
    def _bg_ids_from_flags(clip_ids, bg_flags):
        """
        Collect the clip IDs whose positionally matching flag is truthy.

        :param clip_ids: Sequence of clip IDs, indexable by position.
        :type clip_ids: ndarray

        :param bg_flags: Sequence of flags; a truthy flag at index ``i``
            marks ``clip_ids[i]`` as a background clip.
        :type bg_flags: ndarray

        :return: Background clip IDs as a set of ints, which is the
            container type the constructor requires for ``bg_clip_ids``.
        :rtype: set of int

        """
        return set(int(clip_ids[i])
                   for i, flag in enumerate(bg_flags) if flag)

    @classmethod
    def construct_symmetric_from_files(cls, id_vector_file, kernel_mat_file,
                                       bg_flags_file=None):
        """
        Construct a symmetric DistanceKernel object, optionally with a
        background flags file denoting clip IDs that are to be treated as
        background clips.

        Such a DistanceKernel is usually used with event learning and should
        be provided a background flags file also.

        :param id_vector_file: File containing the numpy.savetxt(...) output
            of clip ID values in the order in which they associate to the
            rows (and columns) of the kernel matrix.
        :type id_vector_file: str

        :param kernel_mat_file: File containing the kernel matrix as saved by
            numpy.save(...) (saved as an ndarray, converted to matrix on
            load).
        :type kernel_mat_file: str

        :param bg_flags_file: Optional file containing output of
            numpy.savetxt(...) where each index maps a row index of the
            kernel to whether or not the associated clip ID should be
            considered a background video.
        :type bg_flags_file: str or None

        :return: Symmetric DistanceKernel constructed with the data provided
            in the provided files.
        :rtype: DistanceKernel

        """
        # ``ndmin=1`` keeps a single-entry file from loading as a 0-d array,
        # which could be neither enumerated nor shape-checked.
        clip_ids = np.array(np.loadtxt(id_vector_file), ndmin=1)
        # noinspection PyCallingNonCallable
        kernel_mat = np.matrix(np.load(kernel_mat_file))

        bg_clips = None
        if bg_flags_file is not None:
            bg_flags = np.array(np.loadtxt(bg_flags_file), ndmin=1)
            # Must be a set: the constructor rejects any other container.
            bg_clips = cls._bg_ids_from_flags(clip_ids, bg_flags)

        return DistanceKernel(clip_ids, clip_ids, kernel_mat, bg_clips)

    @classmethod
    def construct_asymmetric_from_files(cls, row_ids_file, col_ids_file,
                                        kernel_mat_file):
        """
        Construct an asymmetric DistanceKernel object, usually used for
        archive searches. No option for providing background clip IDs as
        asymmetric kernels are NOT used for learning purposes.

        :param row_ids_file: File containing the numpy.savetxt(...) output of
            clip ID values in the order in which they associate to the rows
            of the given kernel matrix.
        :type row_ids_file: str

        :param col_ids_file: File containing the numpy.savetxt(...) output of
            clip ID values in the order in which they associate to the
            columns of the given kernel matrix.
        :type col_ids_file: str

        :param kernel_mat_file: File containing the kernel matrix as saved by
            numpy.save(...) (saved as an ndarray, converted to matrix on
            load).
        :type kernel_mat_file: str

        :return: Asymmetric DistanceKernel constructed with the data provided
            in the provided files.
        :rtype: DistanceKernel

        """
        # ``ndmin=1``: see construct_symmetric_from_files.
        row_cids = np.array(np.loadtxt(row_ids_file), ndmin=1)
        col_cids = np.array(np.loadtxt(col_ids_file), ndmin=1)
        # noinspection PyCallingNonCallable
        kernel_mat = np.matrix(np.load(kernel_mat_file))
        return DistanceKernel(row_cids, col_cids, kernel_mat)

    @property
    def _log(self):
        """
        :return: Logger named after this object's module and class.
        :rtype: logging.Logger
        """
        return logging.getLogger('.'.join([self.__module__,
                                           self.__class__.__name__]))

    def __init__(self, row_id_index_map, col_id_index_map, kernel_mat,
                 bg_clip_ids=None, rw_lock=None):
        """
        Initialize the kernel matrix. The initialization values will more
        than likely be proxies to np.matrix objects.

        ``bg_clip_ids`` may be given when this kernel matrix is to be a
        square, symmetric kernel. It is the set of clip IDs that are to be
        considered "background" IDs, i.e. clips that are always treated as
        negatives. ``symmetric_submatrix`` always includes this set in the
        sub-matrix it returns.

        :param row_id_index_map: Array of clip IDs associated to row indices.
            Contents will be treated as ints.
        :type row_id_index_map: ndarray of int

        :param col_id_index_map: Array of clip IDs associated to column
            indices. Contents will be treated as ints.
        :type col_id_index_map: ndarray of int

        :param kernel_mat: Kernel data matrix. Its row/column counts must
            match the lengths of the row/column ID maps.
        :type kernel_mat: matrix

        :param bg_clip_ids: Optional set of background clip IDs. Every
            element must be convertible to int.
        :type bg_clip_ids: None or set of int or frozenset of int

        :param rw_lock: Read-Write lock for data provided. This should be
            provided if any of the data is shared with other objects/
            sources. If this is given None (default), then a lock is created.
        :type rw_lock: ReadWriteLock or None

        :raises AssertionError: An ID map length does not match the kernel
            shape, ``bg_clip_ids`` is not None/set/frozenset, or ``rw_lock``
            is not a ReadWriteLock.
        :raises ValueError: An element of ``bg_clip_ids`` cannot be converted
            to int.

        """
        # TODO: Possibly add checks for the id arrays like there is for the
        #       bgclipid array (int-able contents)

        assert row_id_index_map.shape[0] == kernel_mat.shape[0], \
            "Length of row index map and kernel row count did not match! " \
            "(row index map: %d, kernel row count: %d)" \
            % (row_id_index_map.shape[0], kernel_mat.shape[0])
        assert col_id_index_map.shape[0] == kernel_mat.shape[1], \
            "Length of col index map and kernel col count did not match! " \
            "(col index map: %d, kernel col count: %d)" \
            % (col_id_index_map.shape[0], kernel_mat.shape[1])

        self._row_id_index_map = row_id_index_map
        self._col_id_index_map = col_id_index_map
        self._kernel = kernel_mat

        assert ((bg_clip_ids is None)
                or isinstance(bg_clip_ids, (set, frozenset))), \
            "Must either given None or a set for the bg_clip_ids " \
            "vector. Got: %s" % type(bg_clip_ids)
        self._bg_cid_set = bg_clip_ids
        if bg_clip_ids is not None:
            # Validation only: the converted values are discarded.
            try:
                for e in bg_clip_ids:
                    int(e)
            except Exception:
                raise ValueError("Not all of the contents of of bg_clip_ids "
                                 "could be treated as ints!")

        if rw_lock:
            assert isinstance(rw_lock, ReadWriteLock), \
                "Did not receive valid istance of RW Lock. Got '%s'" \
                % type(rw_lock)
            self._rw_lock = rw_lock
        else:
            self._rw_lock = ReadWriteLock()

    def get_lock(self):
        """
        :return: This object's read/write lock.
        :rtype: ReadWriteLock
        """
        return self._rw_lock

    def row_id_map(self):
        """
        :return: Row index-to-clipID map
        :rtype: ndarray
        """
        with self.get_lock().read_lock():
            return self._row_id_index_map

    def col_id_map(self):
        """
        :return: Column index-to-clipID map
        :rtype: ndarray
        """
        with self.get_lock().read_lock():
            return self._col_id_index_map

    def get_kernel_matrix(self):
        """
        RETURNED OBJECTS NOT THREAD/PROCESS SAFE. The lock is only held
        while the reference is fetched; the object handed back is the
        underlying matrix itself, not a copy.

        :return: The underlying kernel matrix.
        :rtype: matrix
        """
        with self.get_lock().read_lock():
            return self._kernel

    def get_background_ids(self):
        """
        :return: A frozenset copy of the background clip IDs. This is an
            empty frozenset if no background set was given at construction.
        :rtype: frozenset
        """
        with self.get_lock().read_lock():
            if self._bg_cid_set is None:
                return frozenset()
            return frozenset(self._bg_cid_set)

    def is_symmetric(self):
        """
        :return: True if the clip IDs along the row and column axes are the
            same and in the same order (starting from [0,0] and moving
            outwards).
        :rtype: bool
        """
        with self._rw_lock.read_lock():
            # Doing shape equality short circuit because the return value of
            # numpy.array equality changes depending on this condition,
            # meaning the use of the ...all() member function on the result
            # is not universally possible (i.e. when it returns a bool value
            # when shapes are not equal).
            # noinspection PyUnresolvedReferences
            return bool(
                self._row_id_index_map.shape == self._col_id_index_map.shape
                and (self._row_id_index_map == self._col_id_index_map).all()
            )

    def symmetric_submatrix(self, *clip_ids):
        """
        Return a symmetric sub NxN matrix of the total distance kernel based
        on the clip IDs provided. The background clips will always be
        included in the matrix if this DistanceKernel was constructed with a
        set of background clip IDs.

        Clip IDs provided will be assumed non-background, or positive
        event examples. If the clip ID of a background video is provided as
        an argument, it is reported as non-background in the returned
        index-to-is-background map.

        Rows/columns are selected in kernel edge order, not argument order.
        If a clip ID occurs more than once in the row ID map, only its first
        occurrence is used.

        Note: The matrix returned will always be a new instance and not set
        up to use shared memory. When directly used with shared memory
        objects, it will be passed by value, not by reference.

        :param clip_ids: Integer clip IDs to include in the returned matrix.
            The returned matrix will also contain all background clip IDs.
        :type clip_ids: Iterable of int

        :return: The index-to-clipID map (list), the index-to-is-background
            map (list) and the symmetric NxN submatrix, where N is the number
            of clip IDs provided as arguments plus the number of background
            IDs, minus the overlap between those two sets.
        :rtype: (list of int, list of bool, matrix)

        :raises RuntimeError: The row and column ID maps differ.
        :raises ValueError: A given clip ID cannot be converted to int.
        :raises AssertionError: A given clip ID is not in the row ID map.

        """
        with self._rw_lock.read_lock():
            with SimpleTimer("Checking inputs", self._log.debug):
                if not self.is_symmetric():
                    raise RuntimeError("Cannot get a symmetric sub-matrix if "
                                       "the kernel is not square!")

                # A background set is not required here; without one only
                # the given IDs are used.

                try:
                    positive_ids = set(int(e) for e in clip_ids)
                except Exception:
                    raise ValueError("Not all clip IDs could be used as ints!")

                id_diff = positive_ids.difference(self._row_id_index_map)
                assert not id_diff, \
                    "Not all clip IDs provided are represented in this " \
                    "distance kernel matrix! (difference: %s)" \
                    % id_diff
                del id_diff

            with SimpleTimer("Computing union of BG clips and provided IDs",
                             self._log.debug):
                if self._bg_cid_set is not None:
                    all_cids = self._bg_cid_set.union(positive_ids)
                else:
                    all_cids = set(positive_ids)

            # Reorder the given clip IDs so that they are in the same
            # relative order as the kernel matrix edges. ``seen`` gives the
            # first-occurrence-only filter without rescanning the list.
            focus_indices = []
            focus_clipids = []
            seen = set()
            for idx, cid in enumerate(self._row_id_index_map):
                if cid in all_cids and cid not in seen:
                    seen.add(cid)
                    focus_indices.append(idx)
                    focus_clipids.append(cid)

            # index-to-isBG map for return
            # -> IDs provided as arguments are to be considered
            #    non-background, even if the ID is in the background set.
            #    All other IDs in the union then must be from the background
            #    set.
            focus_id2isbg = [cid not in positive_ids
                             for cid in focus_clipids]

            ret_mat = self._kernel[focus_indices, :][:, focus_indices]
            return focus_clipids, focus_id2isbg, ret_mat

    # noinspection PyPep8Naming
    def extract_rows(self, *clipID_or_IDs):
        """
        Find and return the v-stacked distance vectors, in kernel row order
        (i.e. not in the order given as arguments), of the kernel rows
        matching the given clip IDs.

        Every row whose ID matches is returned, so an ID that occurs more
        than once in the row ID map yields more than one row.

        Note: The matrix returned will always be a new instance and not set
        up to use shared memory. When directly used with shared memory
        objects, it will be passed by value, not by reference.

        :param clipID_or_IDs: The integer clip ID or IDs of which to get the
            distance vectors for.
        :type clipID_or_IDs: int or Iterable of int

        :return: The row-wise index-to-clipID map (tuple), the column-wise
            index-to-clipID map (tuple), and the KxL shape matrix, where K is
            the number of matching rows, and L is the width (columns) of the
            distance kernel.
        :rtype: (tuple of int, tuple of int, matrix)

        :raises ValueError: A given clip ID cannot be converted to int.
        :raises AssertionError: A given clip ID is not in the row ID map.

        """
        with self._rw_lock.read_lock():
            with SimpleTimer("Checking inputs", self._log.debug):
                try:
                    wanted_ids = frozenset(int(e) for e in clipID_or_IDs)
                except Exception as ex:
                    raise ValueError("Not all clip IDs could be used as ints: "
                                     "%s" % str(ex))

                id_diff = wanted_ids.difference(self._row_id_index_map)
                assert not id_diff, \
                    "Not all clip IDs provided are represented in this " \
                    "distance kernel matrix! (difference: %s)" \
                    % id_diff
                del id_diff

            # Reorder the given clip IDs so that they are in the same
            # relative order as the kernel matrix edge order
            with SimpleTimer("Creating focus index/cid sequence",
                             self._log.debug):
                focus_row_indices = []
                focus_row_clipids = []
                for idx, cid in enumerate(self._row_id_index_map):
                    if cid in wanted_ids:
                        focus_row_indices.append(idx)
                        focus_row_clipids.append(cid)

            with SimpleTimer("Cropping kernel to focus range",
                             self._log.debug):
                return (
                    tuple(focus_row_clipids),
                    tuple(self._col_id_index_map),
                    self._kernel[focus_row_indices, :]
                )