Example #1
    def fit(self, descriptors, use_multiprocessing=True):
        """
        Fit the ITQ model given the input set of descriptors.

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param use_multiprocessing: If multiprocessing should be used, as
            opposed to threading, for collecting descriptor vectors from the
            provided iterable.
        :type use_multiprocessing: bool

        :raises RuntimeError: There is already a model loaded

        :return: Matrix of hash codes for the provided descriptors, in order.
        :rtype: numpy.ndarray[bool]

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = None
        if self.get_logger().getEffectiveLevel() <= logging.DEBUG:
            dbg_report_interval = 1.0  # seconds
        if not hasattr(descriptors, "__len__"):
            self._log.info("Creating sequence from iterable")
            descriptors_l = []
            pr = ProgressReporter(self._log.debug, dbg_report_interval).start()
            for d in descriptors:
                descriptors_l.append(d)
                dbg_report_interval and pr.increment_report()
            dbg_report_interval and pr.report()
            descriptors = descriptors_l
        self._log.info("Creating matrix of descriptors for fitting")
        x = elements_to_matrix(descriptors,
                               report_interval=dbg_report_interval,
                               use_multiprocessing=use_multiprocessing)
        self._log.debug("descriptor matrix shape: %s", x.shape)
        n, dim = x.shape

        self._log.debug("Generating random projections")
        np.random.seed(self.random_seed)
        self.rps = np.random.randn(dim, self.bit_length)

        self._log.debug("Info normalizing descriptors with norm type: %s",
                        self.normalize)
        return self.get_hash(x)
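The first branch of ``fit()`` only buffers the input when the iterable has no ``__len__``, logging progress at a fixed interval while it does so. Below is a minimal, self-contained sketch of that buffering pattern; ``SimpleProgressReporter`` and ``buffer_iterable`` are illustrative stand-ins (not SMQTK classes) that mirror the ``start()`` / ``increment_report()`` / ``report()`` interface seen in these examples.

import logging
import time

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger("buffer_demo")


class SimpleProgressReporter:
    """Minimal stand-in for the start()/increment_report()/report()
    interface used above: logs progress at most once per ``interval``
    seconds."""

    def __init__(self, log_func, interval):
        self.log_func = log_func
        self.interval = interval
        self.count = 0
        self._t_start = self._t_last = 0.0

    def start(self):
        self._t_start = self._t_last = time.time()
        return self

    def increment_report(self):
        self.count += 1
        now = time.time()
        if now - self._t_last >= self.interval:
            self._t_last = now
            self.report()

    def report(self):
        elapsed = (time.time() - self._t_start) or 1e-9
        self.log_func("processed %d items (%.1f items/s)",
                      self.count, self.count / elapsed)


def buffer_iterable(iterable, interval=1.0):
    """Materialize a length-less iterable into a list while reporting
    progress, mirroring the ``__len__`` guard in ``fit()`` above."""
    buffered = []
    pr = SimpleProgressReporter(log.debug, interval).start()
    for item in iterable:
        buffered.append(item)
        pr.increment_report()
    pr.report()
    return buffered


# A generator has no __len__, so it gets buffered first.
vectors = buffer_iterable(i * i for i in range(100000))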
Example #2
    def _update_index(self, descriptors):
        """
        Internal method to be implemented by sub-classes to additively update
        the current index with the one or more descriptor elements given.

        If no index exists yet, a new one should be created using the given
        descriptors.

        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        :param descriptors: Iterable of descriptor elements to add to this
            index.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError("Cannot modify container attributes due "
                                    "to being in read-only mode.")
            # tee out iterable for use in adding to index as well as hash code
            # generation.
            d_for_index, d_for_hashing = itertools.tee(descriptors, 2)

            self._log.debug("Updating descriptor index.")
            self.descriptor_set.add_many_descriptors(d_for_index)

            self._log.debug("Generating hash codes for new descriptors")
            prog_reporter = ProgressReporter(self._log.debug, 1.0).start()
            #: :type: collections.deque[numpy.ndarray[bool]]
            hash_vectors = collections.deque()  # for updating hash_index
            # for updating kv-store after collecting new hash codes
            kvstore_update = {}
            for d in d_for_hashing:
                h_vec = self.lsh_functor.get_hash(d.vector())
                hash_vectors.append(h_vec)
                h_int = bit_vector_to_int_large(h_vec)
                # Get, update and reinsert hash UUID set object.
                if h_int not in kvstore_update:
                    #: :type: set
                    kvstore_update[h_int] = \
                        self.hash2uuids_kvstore.get(h_int, set())
                kvstore_update[h_int] |= {d.uuid()}
                prog_reporter.increment_report()
            prog_reporter.report()

            self._log.debug("Updating kv-store with new hash codes")
            self.hash2uuids_kvstore.add_many(kvstore_update)
            del kvstore_update

            if self.hash_index is not None:
                self._log.debug("Updating hash index structure.")
                self.hash_index.update_index(hash_vectors)
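The ``itertools.tee`` call above lets a single descriptor iterable feed two consumers: the descriptor set and the hash-code loop. A minimal sketch of the same idea with hypothetical consumers (``consume_twice`` is an illustrative name, not SMQTK code):

import itertools


def consume_twice(numbers):
    # Duplicate the iterator so two independent consumers can read it.
    for_sum, for_max = itertools.tee(numbers, 2)
    total = sum(for_sum)    # the first consumer drains its copy...
    largest = max(for_max)  # ...tee buffers items so the second still works
    return total, largest


print(consume_twice(iter([3, 1, 4, 1, 5])))  # -> (14, 5)

Note that ``tee`` buffers whatever one branch has read ahead of the other, so fully draining ``d_for_index`` before ``d_for_hashing`` trades memory for a single pass over the source iterable.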
Example #3
    def _build_index(self, descriptors):
        """
        Internal method to be implemented by sub-classes to build the index with
        the given descriptor data elements.

        Subsequent calls to this method should rebuild the current index.  This
        method shall not add to the existing index nor raise an exception so as
        to protect the current index.

        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError(
                    "Cannot modify container attributes due to "
                    "being in read-only mode.")

            self._log.debug("Clearing and adding new descriptor elements")
            self.descriptor_index.clear()
            self.descriptor_index.add_many_descriptors(descriptors)

            self._log.debug("Generating hash codes")
            #: :type: collections.deque[numpy.ndarray[bool]]
            hash_vectors = collections.deque()
            self.hash2uuids_kvstore.clear()
            prog_reporter = ProgressReporter(self._log.debug, 1.0).start()
            # We just cleared the previous store, so aggregate new kv-mapping
            # in ``kvstore_update`` for single update after loop.
            kvstore_update = collections.defaultdict(set)
            for d in self.descriptor_index:
                h_vec = self.lsh_functor.get_hash(d.vector())
                hash_vectors.append(h_vec)
                h_int = bit_vector_to_int_large(h_vec)
                kvstore_update[h_int] |= {d.uuid()}
                prog_reporter.increment_report()
            prog_reporter.report()
            self.hash2uuids_kvstore.add_many(kvstore_update)
            del kvstore_update

            if self.hash_index is not None:
                self._log.debug("Clearing and building hash index of type %s",
                                type(self.hash_index))
                # a build is supposed to clear previous state.
                self.hash_index.build_index(hash_vectors)
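Because the kv-store was just cleared, ``_build_index`` can aggregate the new hash-to-UUID mapping in a plain ``defaultdict(set)`` and push it in one ``add_many`` call, whereas ``_update_index`` above has to seed each entry with ``get(h_int, set())`` to merge with existing values. A small stand-alone sketch of that aggregation step; the ``pairs`` data is made up for illustration:

import collections

# Hypothetical (uuid, integer hash code) pairs standing in for descriptors.
pairs = [("a", 5), ("b", 5), ("c", 9), ("d", 5)]

# Group every UUID that shares a hash code into one set per code, so the
# whole mapping can be handed to the store in a single update.
hash2uuids = collections.defaultdict(set)
for uuid, h_int in pairs:
    hash2uuids[h_int] |= {uuid}

print(dict(hash2uuids))  # -> {5: {'a', 'b', 'd'}, 9: {'c'}} (set order may vary)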
Example #4
def run_file_list(c,
                  filelist_filepath,
                  checkpoint_filepath,
                  batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath to
        SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (default None) of data elements to
        process / descriptors to compute at a time. When set, files and stores
        are written to incrementally during processing instead of in one single
        transaction at the end.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before queueing
        that file for processing. If the check fails, the file is skipped
        instead of a halting exception being raised.
    :type check_image: bool

    """
    log = logging.getLogger(__name__)

    file_paths = [line.strip() for line in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    descriptor_set = cast(
        DescriptorSet,
        from_config_dict(c['descriptor_set'], DescriptorSet.get_impls()))

    # ``data_set`` added to within the ``iter_valid_elements`` function.
    data_set: Optional[DataSet] = None
    if c['optional_data_set']['type'] is None:
        log.info("Not saving loaded data elements to data set")
    else:
        log.info("Initializing data set to append to")
        data_set = cast(
            DataSet,
            from_config_dict(c['optional_data_set'], DataSet.get_impls()))

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    generator = cast(
        DescriptorGenerator,
        from_config_dict(c['descriptor_generator'],
                         DescriptorGenerator.get_impls()))

    def iter_valid_elements():
        def is_valid(file_path):
            e = DataFileElement(file_path)

            if is_valid_element(
                    e,
                    valid_content_types=generator.valid_content_types(),
                    check_image=check_image):
                return e
            else:
                return False

        data_elements: Deque[DataFileElement] = collections.deque()
        valid_files_filter = parallel.parallel_map(is_valid,
                                                   file_paths,
                                                   name="check-file-type",
                                                   use_multiprocessing=True)
        for dfe in valid_files_filter:
            if dfe:
                yield dfe
                if data_set is not None:
                    data_elements.append(dfe)
                    if batch_size and len(data_elements) == batch_size:
                        log.debug(
                            "Adding data element batch to set (size: %d)",
                            len(data_elements))
                        data_set.add_data(*data_elements)
                        data_elements.clear()
        # elements only collected if we have a data-set configured, so add any
        # still in the deque to the set
        if data_set is not None and data_elements:
            log.debug("Adding data elements to set (size: %d",
                      len(data_elements))
            data_set.add_data(*data_elements)

    log.info("Computing descriptors")
    m = compute_many_descriptors(
        iter_valid_elements(),
        generator,
        factory,
        descriptor_set,
        batch_size=batch_size,
    )

    # Recording computed file paths and associated file UUIDs (SHA1)
    cf = open(checkpoint_filepath, 'w')
    cf_writer = csv.writer(cf)
    try:
        pr = ProgressReporter(log.debug, 1.0).start()
        for de, descr in m:
            # We know that we are using DataFileElements going into the
            # compute_many_descriptors, so we can assume that's what comes out
            # of it as well.
            # noinspection PyProtectedMember
            cf_writer.writerow([de._filepath, descr.uuid()])
            pr.increment_report()
        pr.report()
    finally:
        del cf_writer
        cf.close()

    log.info("Done")
Example #5
def compute_distance_kernel(m, dist_func, row_wise=False, parallel=True):
    """
    Method for computing the distance kernel of an array of vectors given a
    distance function that works on two supplied 1D arrays.

    For a valid distance function interface, see
    ``smqtk.utils.distance_functions.histogram_intersection_distance2``.

    :param m: An array of vectors to compute the pairwise distance kernel for.
    :type m: numpy.ndarray

    :param dist_func: Distance function
    :type dist_func: (ndarray, ndarray) -> ndarray[float] | float

    :param row_wise: Whether the given distance function can take a vector and
        a matrix and compute pair-wise distances, returning a vector of
        distances between the given vector and each row of the matrix.
    :type row_wise: bool

    :param parallel: If distances should be calculated in parallel. This is true
        by default.
    :type parallel: bool

    :return: Computed symmetric distance kernel
    :rtype: numpy.ndarray

    """
    log = logging.getLogger(__name__)

    if m.ndim == 1:
        m = m[np.newaxis]

    log.info("Computing distance kernel")
    side = m.shape[0]
    mat = np.ndarray((side, side), dtype=float)

    pr = ProgressReporter(log.debug, 1.0)
    if row_wise:
        log.debug("Computing row-wise distances")
        # For all rows except the last one. We'll have computed all distances
        # by the time we reach m[side-1].
        if parallel:
            # noinspection PyShadowingNames
            def work_func(i):
                mat[i, i] = dist_func(m[i], m[i])
                if i < (side - 1):
                    mat[i + 1:,
                        i] = mat[i, i + 1:] = dist_func(m[i, :], m[i + 1:, :])

            # Using threading for in-place modification
            pr.start()
            for _ in parallel_map(work_func,
                                  range(side),
                                  use_multiprocessing=False):
                pr.increment_report()
        else:
            pr.start()
            for i in range(side):
                # Compute col/row wise distances
                mat[i, i] = dist_func(m[i], m[i])
                if i < (side - 1):
                    mat[i + 1:,
                        i] = mat[i, i + 1:] = dist_func(m[i, :], m[i + 1:, :])
                pr.increment_report()
    else:
        log.debug("Computing element-wise distances")
        if parallel:
            # noinspection PyShadowingNames
            def work_func(i):
                mat[i, i] = dist_func(m[i], m[i])
                # cols to the left of diagonal index for this row
                for j in range(i):
                    mat[i, j] = mat[j, i] = dist_func(m[i], m[j])

            # Using threading for in-place modification
            pr.start()
            for _ in parallel_map(work_func,
                                  range(side),
                                  use_multiprocessing=False):
                pr.increment_report()
        else:
            pr.start()
            for i in range(side):
                mat[i, i] = dist_func(m[i], m[i])
                # cols to the left of diagonal index for this row
                for j in range(i):
                    mat[i, j] = mat[j, i] = dist_func(m[i], m[j])
                pr.increment_report()
    pr.report()
    return mat
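The subtle part of the row-wise branch is the chained assignment ``mat[i + 1:, i] = mat[i, i + 1:] = ...``, which fills a column of the lower triangle and the matching row of the upper triangle from one distance call. A serial, self-contained sketch of that fill pattern, assuming a Euclidean row-wise distance (the helper names are illustrative, not SMQTK functions):

import numpy as np


def euclidean_rowwise(v, m):
    """Distance between 1D vector ``v`` and each row of 2D matrix ``m``."""
    return np.sqrt(((m - v) ** 2).sum(axis=1))


def distance_kernel_rowwise(m, dist_func):
    side = m.shape[0]
    mat = np.zeros((side, side), dtype=float)
    for i in range(side):
        mat[i, i] = 0.0  # self-distance of a metric is zero
        if i < side - 1:
            # One call fills the rest of row i and, symmetrically, column i.
            mat[i + 1:, i] = mat[i, i + 1:] = dist_func(m[i], m[i + 1:])
    return mat


m = np.random.rand(4, 8)
k = distance_kernel_rowwise(m, euclidean_rowwise)
print(np.allclose(k, k.T))  # True: the kernel is symmetric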
Example #6
    def fit(self, descriptors, use_multiprocessing=True):
        """
        Fit the ITQ model given the input set of descriptors.

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param use_multiprocessing: If multiprocessing should be used, as
            opposed to threading, when collecting descriptor elements from the
            given iterable.
        :type use_multiprocessing: bool

        :raises RuntimeError: There is already a model loaded

        :return: Matrix of hash codes for the provided descriptors, in order.
        :rtype: numpy.ndarray[bool]

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = 1.0
        dbg_report = self.get_logger().getEffectiveLevel() <= logging.DEBUG
        if not isinstance(descriptors, Sequence):
            self._log.info("Creating sequence from iterable")
            descriptors_l = []
            pr = ProgressReporter(self._log.debug, dbg_report_interval).start()
            for d in descriptors:
                descriptors_l.append(d)
                dbg_report and pr.increment_report()
            dbg_report and pr.report()
            descriptors = descriptors_l
        if len(descriptors[0].vector()) < self.bit_length:
            raise ValueError("Input descriptors have fewer features than "
                             "requested bit encoding. Hash codes will be "
                             "smaller than requested due to PCA decomposition "
                             "result being bound by number of features.")

        self._log.info("Creating matrix of descriptors for fitting")
        x = elements_to_matrix(descriptors,
                               report_interval=dbg_report_interval,
                               use_multiprocessing=use_multiprocessing)
        self._log.debug("descriptor matrix shape: %s", x.shape)

        self._log.debug("Info normalizing descriptors by factor: %s",
                        self.normalize)
        x = self._norm_vector(x)

        self._log.info("Centering data")
        self.mean_vec = numpy.mean(x, axis=0)
        x -= self.mean_vec

        self._log.info("Computing PCA transformation")
        self._log.debug("-- computing covariance")
        # ``cov`` wants each row to be a feature and each column an observation
        # of those features. Thus, each column should be a descriptor vector,
        # thus we need the transpose here.
        c = numpy.cov(x.transpose())

        # Direct translation from UNC matlab code
        # - eigen vectors are the columns of ``pc``
        self._log.debug('-- computing linalg.eig')
        l, pc = numpy.linalg.eig(c)
        self._log.debug('-- ordering eigen vectors by descending eigen '
                        'value')

        # # Harry translation of original matlab code
        # # - Uses singular values / vectors, not eigen
        # # - singular vectors are the columns of pc
        # self._log.debug('-- computing linalg.svd')
        # pc, l, _ = numpy.linalg.svd(c)
        # self._log.debug('-- ordering singular vectors by descending '
        #                 'singular value')

        # Same ordering method for both eig/svd sources.
        l_pc_ordered = sorted(zip(l, pc.transpose()),
                              key=lambda _p: _p[0],
                              reverse=True)

        self._log.debug("-- top vector extraction")
        # Only keep the top ``bit_length`` vectors after ordering by descending
        # value magnitude.
        # - Transposing vectors back to column-vectors.
        pc_top = numpy.array([p[1] for p in l_pc_ordered[:self.bit_length]])\
            .transpose()
        self._log.debug("-- project centered data by PC matrix")
        v = numpy.dot(x, pc_top)

        self._log.info("Performing ITQ to find optimal rotation")
        c, self.rotation = self._find_itq_rotation(v, self.itq_iterations)
        # De-adjust rotation with PC vector
        self.rotation = numpy.dot(pc_top, self.rotation)

        self.save_model()

        return c
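The eigenvector handling above (covariance of the transposed data, ``numpy.linalg.eig``, then ordering column vectors by descending eigenvalue and keeping the top ``bit_length``) can be restated compactly with ``argsort``. A hedged sketch of just that PCA step; ``top_k_principal_components`` is an illustrative helper, not part of SMQTK, and it simplifies the sorted-zip ordering used in the original:

import numpy as np


def top_k_principal_components(x, k):
    """Center ``x``, take the covariance eigenvectors, and keep the ``k``
    with the largest eigenvalues as column vectors (shape: dim x k)."""
    x = x - x.mean(axis=0)               # center the data
    c = np.cov(x.T)                      # np.cov wants features on rows
    eigvals, eigvecs = np.linalg.eig(c)  # eigenvectors are the columns
    order = np.argsort(eigvals)[::-1][:k]
    return eigvecs[:, order]


x = np.random.rand(100, 16)
pc_top = top_k_principal_components(x, 8)
v = np.dot(x - x.mean(axis=0), pc_top)   # project centered data
print(pc_top.shape, v.shape)             # (16, 8) (100, 8)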
Example #7
    def compute_descriptor_async(self, data_iter,
                                 descr_factory=DFLT_DESCRIPTOR_FACTORY,
                                 overwrite=False, procs=None, **kwds):
        """
        Asynchronously compute feature data for multiple data items.

        :param data_iter: Iterable of data elements to compute features for.
            These must have UIDs assigned for feature association in return
            value.
        :type data_iter: collections.Iterable[smqtk.representation.DataElement]

        :param descr_factory: Factory instance to produce the wrapping
            descriptor element instance. The default factory produces
            ``DescriptorMemoryElement`` instances.
        :type descr_factory: smqtk.representation.DescriptorElementFactory

        :param overwrite: Whether or not to force re-computation of descriptor
            vectors for the given data even when precomputed vectors already
            exist in the DescriptorElements generated by the provided factory.
            This will overwrite the persistently stored vectors if the provided
            factory produces a DescriptorElement implementation with such
            storage.
        :type overwrite: bool

        :param procs: Optional specification of how many processors to use
            when pooling sub-tasks. If None, we attempt to use all available
            cores.
        :type procs: int | None

        :raises ValueError: An input DataElement was of a content type that we
            cannot handle.

        :return: Mapping of input DataElement UUIDs to the computed descriptor
            element for that data. DescriptorElement UUIDs are congruent with
            the UUID of the data element they describe.
        :rtype: dict[collections.Hashable,
                     smqtk.representation.DescriptorElement]

        """
        self._set_caffe_mode()

        # Create DescriptorElement instances for each data elem.
        data_elements = {}
        descr_elements = {}
        self._log.debug("Checking content types; aggregating data/descriptor "
                        "elements.")
        pr = ProgressReporter(self._log.debug, 1.0).start()
        for data in data_iter:
            ct = data.content_type()
            if ct not in self.valid_content_types():
                self._log.error("Cannot compute descriptor from content type "
                                "'%s' data: %s)" % (ct, data))
                raise ValueError("Cannot compute descriptor from content type "
                                 "'%s' data: %s)" % (ct, data))
            data_elements[data.uuid()] = data
            descr_elements[data.uuid()] = \
                descr_factory.new_descriptor(self.name, data.uuid())
            pr.increment_report()
        pr.report()
        self._log.debug("Given %d unique data elements", len(data_elements))

        # Reduce procs down to the number of elements to process if it's smaller
        if len(data_elements) < (procs or multiprocessing.cpu_count()):
            procs = len(data_elements)
        if procs == 0:
            raise ValueError("No data elements provided")

        # For thread safety, only use .append() and .popleft() (queue)
        uuid4proc = deque()

        def check_get_uuid(descriptor_elem):
            if overwrite or not descriptor_elem.has_vector():
                uuid4proc.append(descriptor_elem.uuid())

        # Using thread-pool due to in-line function + updating local deque
        p = multiprocessing.pool.ThreadPool(procs)
        try:
            p.map(check_get_uuid, six.itervalues(descr_elements))
        finally:
            p.close()
            p.join()
        del p
        self._log.debug("%d descriptors already computed",
                        len(data_elements) - len(uuid4proc))

        if uuid4proc:
            self._log.debug("Converting deque to tuple for segmentation")
            uuid4proc = tuple(uuid4proc)

            # Split UUIDs into groups equal to our batch size, and an optional
            # tail group that is smaller than our batch size.
            tail_size = len(uuid4proc) % self.batch_size
            batch_groups = (len(uuid4proc) - tail_size) // self.batch_size
            self._log.debug("Processing %d batches of size %d", batch_groups,
                            self.batch_size)
            if tail_size:
                self._log.debug("Processing tail group of size %d", tail_size)

            if batch_groups:
                for g in range(batch_groups):
                    self._log.debug("Starting batch: %d of %d",
                                    g + 1, batch_groups)
                    batch_uuids = \
                        uuid4proc[g * self.batch_size:(g + 1) * self.batch_size]
                    self._process_batch(batch_uuids, data_elements,
                                        descr_elements, procs,
                                        kwds.get('use_mp', True))

            if tail_size:
                batch_uuids = uuid4proc[-tail_size:]
                self._log.debug("Starting tail batch (size=%d)",
                                len(batch_uuids))
                self._process_batch(batch_uuids, data_elements, descr_elements,
                                    procs, kwds.get('use_mp', True))

        self._log.debug("forming output dict")
        return dict((data_elements[k].uuid(), descr_elements[k])
                    for k in data_elements)
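The batch bookkeeping above (``tail_size``, ``batch_groups`` and the two slicing paths) is easier to see in isolation. A minimal sketch of the same splitting arithmetic; ``iter_batches`` is an illustrative helper, not SMQTK API:

def iter_batches(items, batch_size):
    """Split ``items`` into full batches of ``batch_size`` plus an
    optional smaller tail batch."""
    items = tuple(items)
    tail_size = len(items) % batch_size
    batch_groups = (len(items) - tail_size) // batch_size
    for g in range(batch_groups):
        yield items[g * batch_size:(g + 1) * batch_size]
    if tail_size:
        yield items[-tail_size:]


print(list(iter_batches(range(7), 3)))  # [(0, 1, 2), (3, 4, 5), (6,)]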
Example #8
            except elasticsearch.ConnectionTimeout as ex:
                log.warning("ElasticSearch timed out (error = %s)", str(ex))
                restart = True
                log.debug("Restarting query from index %d", i)

    log.info("Initializing image download/record parallel iterator")
    img_dl_records = parallel_map(dl_image,
                                  iter_scan_meta(),
                                  name='image_download',
                                  use_multiprocessing=True,
                                  cores=cores)

    # Write out
    log.info("Starting iteration/file-write")
    with open(scan_record, 'w') as record_file:
        pr = ProgressReporter(log.debug, 1.0).start()
        for r in img_dl_records:
            if r is not None:
                cdr_id, local_path, uuid = r
                record_file.write('%s,%s,%s\n' % (cdr_id, local_path, uuid))
            pr.increment_report()
        pr.report()


def default_config():
    return {
        "image_types": ['jpeg', 'png', 'tiff'],
        "elastic_search": {
            "instance_address": "CHANGEME",
            "index": "CHANGEME",
            "username": "******",