Exemplo n.º 1
0
    def get_many_vectors(self, uuids):
        """
        Get underlying vectors of descriptors associated with given uuids.

        The descriptor elements are looked up through
        ``self.get_many_descriptors`` and their vectors are then fetched in
        bulk through ``DescriptorElement.get_many_vectors``.

        :param uuids: Iterable of descriptor UUIDs to query for.
        :type uuids: collections.Iterable[collections.Hashable]

        :return: Vectors for descriptors associated with given uuid values,
            exactly as produced by ``DescriptorElement.get_many_vectors``
            (presumably ``None`` entries for descriptors without a stored
            vector -- confirm against that method).
        :rtype: collections.Iterable[numpy.ndarray | None]

        """
        # The result must be returned; previously it was computed and then
        # discarded, so callers always received ``None``.
        return DescriptorElement.get_many_vectors(
            self.get_many_descriptors(uuids)
        )
Exemplo n.º 2
0
    def from_config(cls, config_dict, merge_default=True):
        """
        Build a ``DescriptorElementFactory`` from a JSON-compliant
        configuration dictionary encapsulating initialization arguments.

        This method should not be called via super unless an instance of
        this class is desired.

        :param config_dict: JSON compliant dictionary encapsulating
            a configuration.
        :type config_dict: dict

        :param merge_default: When true, the given configuration is merged
            on top of the default provided by ``get_default_config`` before
            being interpreted.
        :type merge_default: bool

        :return: Constructed instance from the provided config.
        :rtype: DescriptorElementFactory

        """
        effective_config = config_dict
        if merge_default:
            effective_config = merge_dict(
                cls.get_default_config(), config_dict
            )

        # Resolve which DescriptorElement implementation the configuration
        # selects, along with that implementation's own configuration.
        element_type, element_config = cls_conf_from_config_dict(
            effective_config, DescriptorElement.get_impls()
        )
        return DescriptorElementFactory(element_type, element_config)
Exemplo n.º 3
0
        def iter_tocompute_arrays():
            """ Generate the vectors of descriptor elements whose
            classifications still need to be computed.

            For every input descriptor a classification element is created
            and appended, together with its "already computed" status, to
            ``elem_and_status_q``. Once the input is exhausted, the index of
            the final input element (or -1 when there was no input) is
            stored in ``end_of_iter[0]``.

            :raises ValueError: A descriptor element has no vector stored.

            :rtype: typing.Generator[numpy.ndarray]
            """
            # Force into an iterator so successive islices consume it.
            source = iter(descr_iter)
            # Total number of descriptor elements processed so far.
            n_seen = 0

            while True:
                # Pull the next batch; an empty batch means the input
                # iterator has been exhausted.
                batch = list(itertools.islice(source, d_elem_batch))
                if not batch:
                    break

                # Fetch vectors through the implementation-level batch
                # aggregation methods where applicable.
                batch_vecs = DescriptorElement.get_many_vectors(batch)

                for elem, vec in zip(batch, batch_vecs):
                    uid = elem.uuid()
                    if vec is None:
                        raise ValueError(
                            "Encountered DescriptorElement with "
                            "no vector stored! (UID=`{}`)".format(uid))

                    c_elem = factory.new_classification(self.name, uid)
                    already_computed = \
                        not overwrite and c_elem.has_classifications()
                    elem_and_status_q.append((c_elem, already_computed))

                    if already_computed:
                        log_debug("Classification already generated for UID "
                                  "`{}`.".format(uid))
                        continue

                    # Classifications should be computed for this descriptor.
                    log_debug(
                        "Yielding descriptor array with UID `{}` "
                        "for classification generation.".format(uid))
                    yield vec

                n_seen += len(batch_vecs)

            # Index of the final input element; -1 when nothing was seen.
            end_of_iter[0] = n_seen - 1
Exemplo n.º 4
0
    def get_default_config(cls):
        """
        Generate and return a default configuration dictionary for this class.

        This is primarily used for showing what the configuration dictionary
        looks like for this class without instantiating it. The dictionary
        is built by ``make_default_config`` from the available
        ``DescriptorElement`` implementations.

        It is not guaranteed that the configuration dictionary returned
        from this method is valid for construction of an instance of this
        class.

        :return: Default configuration dictionary for the class.
        :rtype: dict

        """
        available_impls = DescriptorElement.get_impls()
        return make_default_config(available_impls)
Exemplo n.º 5
0
 def _train(self, class_examples, **extra_params):
     # convert descriptor elements into combines ndarray with associated
     # label vector.
     vec_list = []
     label_list = []
     for label, examples in class_examples.items():
         label_vectors = \
             DescriptorElement.get_many_vectors(examples)
         # ``is`` or ``count`` method messes up when elements are np arrays.
         none_count = len([e for e in label_vectors if e is None])
         assert none_count == 0, \
             "Some descriptor elements for label {} did not contain " \
             "vectors! (n={})".format(label, none_count)
         vec_list.extend(label_vectors)
         label_list.extend([label] * len(label_vectors))
     vec_list = np.vstack(vec_list)
     self.fit(vec_list, label_list)
Exemplo n.º 6
0
 def build_index(self, descriptors):
     """
     Cache the given descriptor elements and their vectors, as a matrix,
     on this instance for use during ``rank``.

     :param descriptors: Iterable of descriptor elements to index.

     :raises ValueError: No descriptor elements were given, or the stacked
         vectors form an object-dtype array (some element lacked a vector
         or the vectors were not of congruent dimensionality).

     """
     descr_elem_list = list(descriptors)
     if not descr_elem_list:
         raise ValueError("No descriptor elements passed.")
     # note: this fails if multiple descriptor elements with the same UID
     #       are included. There will be None's present.
     descr_matrix = np.asarray(
         DescriptorElement.get_many_vectors(descr_elem_list))
     # If the result matrix is of dtype(object), then either some elements
     # did not have vectors or some vectors were not of congruent
     # dimensionality.
     # NOTE(review): newer NumPy releases may raise ValueError from
     # ``asarray`` for ragged input instead of producing an object array.
     if descr_matrix.dtype == np.dtype(object):
         raise ValueError("One or more descriptor elements did not have a "
                          "vector set or were not of congruent "
                          "dimensionality.")
     # Only commit state once the matrix has been validated.
     self._descr_elem_list = descr_elem_list
     self._descr_matrix = descr_matrix
Exemplo n.º 7
0
    def get_many_vectors(self, uuids):
        """
        Fetch the vectors of the descriptors stored under the given uuids.

        :param uuids: Iterable of descriptor UUIDs to query for.
        :type uuids: collections.abc.Iterable[collections.abc.Hashable]

        :raises: KeyError: When there is not a descriptor in this set for one
            or more input UIDs.

        :return: List of vectors for descriptors associated with given uuid
            values.
        :rtype: list[numpy.ndarray | None]

        """
        # Resolve the elements first, then fetch their vectors in bulk.
        descriptor_elements = self.get_many_descriptors(uuids)
        return DescriptorElement.get_many_vectors(descriptor_elements)
Exemplo n.º 8
0
    def get_default_config(cls):
        """
        Generate and return a default configuration dictionary for this class.

        This is primarily used for showing what the configuration dictionary
        looks like for this class without instantiating it. The base
        configuration comes from the parent class; the
        ``wrapped_element_factory`` entry is then replaced with a nested,
        JSON-compliant configuration block.

        It is not guaranteed that the configuration dictionary returned
        from this method is valid for construction of an instance of this
        class.

        :return: Default configuration dictionary for the class.
        :rtype: dict

        """
        config = super(CachingDescriptorElement, cls).get_default_config()
        wrapped_factory = config['wrapped_element_factory']

        if wrapped_factory is not None:
            # A default factory instance exists: use its configuration.
            config['wrapped_element_factory'] = wrapped_factory.get_config()
            return config

        # No default factory: build the block a DescriptorElementFactory
        # wants from the nestable implementations. This class must be left
        # out of that list, otherwise an infinite recursion will occur.
        nestable_impls = DescriptorElement.get_impls()
        nestable_impls.remove(cls)
        config['wrapped_element_factory'] = make_default_config(nestable_impls)
        return config
Exemplo n.º 9
0
    def _train(self, class_examples, **extra_params):
        """
        Internal method that trains the classifier implementation.

        This method is called after checking that there is not already a model
        trained, thus it can be assumed that no model currently exists.

        The class labels will have already been checked before entering this
        method, so it can be assumed that the ``class_examples`` will contain
        at least two classes.

        Integer SVM labels are assigned to the semantic labels in sorted
        order, starting at 1, and recorded in ``self.svm_label_map``. After
        training, the label map and the model are written to
        ``self.svm_label_map_elem`` / ``self.svm_model_elem`` when those
        elements are set and writable.

        :param class_examples: Dictionary mapping class labels to iterables of
            DescriptorElement training examples.
        :type class_examples: dict[collections.abc.Hashable,
                 collections.abc.Iterable[smqtk.representation.DescriptorElement]]

        :param extra_params: Dictionary with extra parameters for training.
            This is not used by this implementation.
        :type extra_params: None | dict[basestring, object]

        """

        # Offset from 0 for positive class labels to use
        # - not using label of 0 because we think libSVM wants positive labels
        CLASS_LABEL_OFFSET = 1

        # Stuff for debug reporting: pass the ``-q`` flag to libSVM unless
        # we are logging at debug level.
        param_debug = {'-q': ''}
        if self._log.getEffectiveLevel() <= logging.DEBUG:
            param_debug = {}

        # Form libSVM problem input values
        self._log.debug("Formatting problem input")
        train_labels = []
        train_vectors = []
        train_group_sizes = []  # number of examples per class
        self.svm_label_map = {}
        # Making SVM label assignment deterministic to alphabetic order
        for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
            # Map integer SVM label to semantic label
            self.svm_label_map[i] = l

            self._log.debug('-- class %d (%s)', i, l)
            # requires a sequence, so making the iterable ``g`` a tuple
            g = class_examples[l]
            if not isinstance(g, collections.abc.Sequence):
                self._log.debug('   (expanding iterable into sequence)')
                g = tuple(g)

            train_group_sizes.append(float(len(g)))
            x = numpy.array(DescriptorElement.get_many_vectors(g))
            x = self._norm_vector(x)
            train_labels.extend([i] * x.shape[0])
            train_vectors.extend(x.tolist())
            # Release per-class intermediates before the next iteration.
            del g, x

        # Trailing space on the middle literal keeps the concatenated
        # message readable ("... vectors being sent ...").
        assert len(train_labels) == len(train_vectors), \
            "Count mismatch between parallel labels and descriptor vectors " \
            "being sent to libSVM (%d != %d)" \
            % (len(train_labels), len(train_vectors))

        self._log.debug("Forming train params")
        #: :type: dict
        params = deepcopy(self.train_params)
        params.update(param_debug)
        # Calculating class weights if set to C-SVC type SVM
        if '-s' not in params or int(params['-s']) == 0:
            # (john.moeller): The weighting should probably be the geometric
            # mean of the number of examples over the classes divided by the
            # number of examples for the current class.
            gmean = scipy.stats.gmean(train_group_sizes)
            for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
                w = gmean / n
                params['-w' + str(i)] = w
                self._log.debug("-- class '%s' weight: %s",
                                self.svm_label_map[i], w)

        self._log.debug("Making parameters obj")
        svm_params = svmutil.svm_parameter(self._gen_param_string(params))
        self._log.debug("Creating SVM problem")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        del train_vectors
        self._log.debug("Training SVM model")
        self.svm_model = svmutil.svm_train(svm_problem, svm_params)
        self._log.debug("Training SVM model -- Done")

        if self.svm_label_map_elem and self.svm_label_map_elem.writable():
            self._log.debug("saving labels to element (%s)",
                            self.svm_label_map_elem)
            self.svm_label_map_elem.set_bytes(
                pickle.dumps(self.svm_label_map, -1)
            )
        if self.svm_model_elem and self.svm_model_elem.writable():
            self._log.debug("saving model to element (%s)",
                            self.svm_model_elem)
            # LibSvm I/O only works with filepaths, thus the need for an
            # intermediate temporary file.
            fd, fp = tempfile.mkstemp()
            try:
                # Wrap the file descriptor in a file object *before* saving
                # so that it is closed even when ``svm_save_model`` raises;
                # otherwise the descriptor returned by ``mkstemp`` leaks.
                # fdopen() is required because in Python 2 open() does
                # not accept a file descriptor.
                with os.fdopen(fd, 'rb') as f:
                    svmutil.svm_save_model(fp, self.svm_model)
                    # Nothing has been read through ``f`` yet, so this
                    # reads the content just written to the same file.
                    self.svm_model_elem.set_bytes(f.read())
            finally:
                os.remove(fp)