예제 #1
0
    def del_classifier(self):
        """
        Remove the classifier registered under the given label.

        Form args:
            label
                Label of the classifier to remove.

        Possible error codes:
            400
                No label was provided.
            404
                No classifier is registered under the given label.
            405
                The label refers to a classifier marked as immutable.

        On success the response echoes the removed label under
        ``removed_label``. No explicit status code is passed in that case, so
        the default of ``make_response_json`` applies.

        """
        label = flask.request.values.get('label', default=None)

        # A missing value and an empty string are both treated as "no label".
        if not label:
            return make_response_json("No label provided.", 400)

        if label not in self.classifier_collection.labels():
            message = ("Label '%s' does not refer to a classifier currently "
                       "registered." % label)
            return make_response_json(message, 404, label=label)

        # Immutable labels are protected from removal.
        if label in self.immutable_labels:
            message = ("Label '%s' refers to a classifier that is immutable."
                       % label)
            return make_response_json(message, 405, label=label)

        self.classifier_collection.remove_classifier(label)

        message = "Removed classifier with label '%s'." % label
        return make_response_json(message, removed_label=label)
예제 #2
0
    def del_classifier(self):
        """
        Remove the classifier registered under the given label.

        Form args:
            label
                Label of the classifier to remove.

        Possible error codes:
            400
                No label was provided.
            404
                No classifier is registered under the given label.
            405
                The label refers to a classifier marked as immutable.

        On success the response echoes the removed label under
        ``removed_label``. No explicit status code is passed in that case, so
        the default of ``make_response_json`` applies.

        """
        label = flask.request.values.get('label', default=None)

        # A missing value and an empty string are both treated as "no label".
        if not label:
            return make_response_json("No label provided.", 400)

        if label not in self.classifier_collection.labels():
            message = ("Label '%s' does not refer to a classifier currently "
                       "registered." % label)
            return make_response_json(message, 404, label=label)

        # Immutable labels are protected from removal.
        if label in self.immutable_labels:
            message = ("Label '%s' refers to a classifier that is immutable."
                       % label)
            return make_response_json(message, 405, label=label)

        self.classifier_collection.remove_classifier(label)

        message = "Removed classifier with label '%s'." % label
        return make_response_json(message, removed_label=label)
예제 #3
0
    def get_classifier_metadata(self):
        """
        Report metadata for the single classifier registered under ``label``.

        URL Arguments:
            label
                Reference label for a specific classifier to query.

        Possible error codes:
            400
                No label was provided.
            404
                No classifier is registered under the given label.

        Returns code 200 on success and the JSON return object: {
            ...
            // Sequence of class labels that this classifier can classify
            // descriptors into.  This includes the negative label.
            class_labels=<list[str]>
        }

        """
        label = flask.request.values.get('label', default=None)

        # A missing value and an empty string are both treated as "no label".
        if not label:
            return make_response_json("No label provided.", return_code=400)

        if label not in self.classifier_collection.labels():
            message = ("Label '%s' does not refer to a classifier currently "
                       "registered." % label)
            return make_response_json(message, return_code=404, label=label)

        classifier = self.classifier_collection.get_classifier(label)
        return make_response_json("Success", return_code=200,
                                  class_labels=classifier.get_labels())
예제 #4
0
    def get_classifier_metadata(self):
        """
        Report metadata for the single classifier registered under ``label``.

        URL Arguments:
            label
                Reference label for a specific classifier to query.

        Possible error codes:
            400
                No label was provided.
            404
                No classifier is registered under the given label.

        Returns code 200 on success and the JSON return object: {
            ...
            // Sequence of class labels that this classifier can classify
            // descriptors into.  This includes the negative label.
            class_labels=<list[str]>
        }

        """
        label = flask.request.values.get('label', default=None)

        # A missing value and an empty string are both treated as "no label".
        if not label:
            return make_response_json("No label provided.", return_code=400)

        if label not in self.classifier_collection.labels():
            message = ("Label '%s' does not refer to a classifier currently "
                       "registered." % label)
            return make_response_json(message, return_code=404, label=label)

        classifier = self.classifier_collection.get_classifier(label)
        return make_response_json("Success", return_code=200,
                                  class_labels=classifier.get_labels())
예제 #5
0
    def get_classifier(self):
        """
        Download the classifier corresponding to the provided label, pickled
        and encoded in standard base64 encoding.

        Below is an example call to this endpoint via the ``requests`` python
        module::

            import base64
            import requests
            from six.moves import cPickle as pickle

            r = requests.get('http://localhost:5000/classifier',
                             data={'label': 'some_label'})
            data_bytes = base64.b64decode(r.content)
            classifier = pickle.loads(data_bytes)

        With curl on the command line::

            $ curl -X GET localhost:5000/classifier -d label=some_label | \
                base64 -d > /path/to/file.pkl

        Data args:
            label
                Label of the requested classifier

        Possible error codes:
            400
                No label provided
            404
                Label does not refer to a registered classifier
            500
                The classifier could not be pickled

        Returns: The pickled and encoded classifier
        """
        label = flask.request.values.get('label', default=None)
        # A missing value and an empty string are both treated as "no label".
        if not label:
            return make_response_json("No label provided.", 400)
        elif label not in self.classifier_collection.labels():
            return make_response_json("Label '%s' does not refer to a "
                                      "classifier currently registered." %
                                      label,
                                      404,
                                      label=label)

        clfr = self.classifier_collection.get_classifier(label)

        try:
            return base64.b64encode(pickle.dumps(clfr)), 200
        except (pickle.PicklingError, TypeError, AttributeError):
            # ``pickle.dumps`` does not fail with ``PicklingError`` only: in
            # practice CPython also raises ``TypeError`` (e.g. for objects
            # holding locks or open files) and ``AttributeError`` (e.g. for
            # some locally defined objects). All of them mean the same thing
            # to the client, so report them uniformly.
            return make_response_json("Classifier corresponding to label "
                                      "'%s' cannot be pickled." % label,
                                      500,
                                      label=label)
예제 #6
0
    def get_classifier(self):
        """
        Download the classifier corresponding to the provided label, pickled
        and encoded in standard base64 encoding.

        Below is an example call to this endpoint via the ``requests`` python
        module::

            import base64
            import requests
            from six.moves import cPickle as pickle

            r = requests.get('http://localhost:5000/classifier',
                             data={'label': 'some_label'})
            data_bytes = base64.b64decode(r.content)
            classifier = pickle.loads(data_bytes)

        With curl on the command line::

            $ curl -X GET localhost:5000/classifier -d label=some_label | \
                base64 -d > /path/to/file.pkl

        Data args:
            label
                Label of the requested classifier

        Possible error codes:
            400
                No label provided
            404
                Label does not refer to a registered classifier
            500
                The classifier could not be pickled

        Returns: The pickled and encoded classifier
        """
        label = flask.request.values.get('label', default=None)
        # A missing value and an empty string are both treated as "no label".
        if not label:
            return make_response_json("No label provided.", 400)
        elif label not in self.classifier_collection.labels():
            return make_response_json("Label '%s' does not refer to a "
                                      "classifier currently registered."
                                      % label,
                                      404,
                                      label=label)

        clfr = self.classifier_collection.get_classifier(label)

        try:
            return base64.b64encode(pickle.dumps(clfr)), 200
        except (pickle.PicklingError, TypeError, AttributeError):
            # ``pickle.dumps`` does not fail with ``PicklingError`` only: in
            # practice CPython also raises ``TypeError`` (e.g. for objects
            # holding locks or open files) and ``AttributeError`` (e.g. for
            # some locally defined objects). All of them mean the same thing
            # to the client, so report them uniformly.
            return make_response_json("Classifier corresponding to label "
                                      "'%s' cannot be pickled." % label,
                                      500,
                                      label=label)
예제 #7
0
    def get_classifier_labels(self):
        """
        List the labels of every classifier held by the classifier
        collection.

        Returns 200: {
            labels: list[str]
        }

        """
        # Materialize into a list before handing the labels to the response
        # builder.
        label_list = list(self.classifier_collection.labels())
        return make_response_json("Classifier labels.", labels=label_list)
예제 #8
0
    def get_classifier_labels(self):
        """
        List the labels of every classifier held by the classifier
        collection.

        Returns 200: {
            labels: list[str]
        }

        """
        # Materialize into a list before handing the labels to the response
        # builder.
        label_list = list(self.classifier_collection.labels())
        return make_response_json("Classifier labels.", labels=label_list)
예제 #9
0
    def classify_uids(self):
        """
        Given a list of descriptor UIDs, we attempt to retrieve descriptor
        vectors from our configured descriptor index and classify those vectors
        against all currently stored classifiers (optionally a list of
        requested classifiers), returning a map of classifier descriptive
        labels to their class-to-probability results.

        If no classifiers are currently loaded, an empty result is returned
        with code 200 before any input is inspected.

        TODO: Add `adjustment` param from POST /classify

        Arguments
            uid_list:
                JSON list of UIDs of descriptors in the configured descriptor
                set to classify. These should be strings or integers depending
                on the keys used in the configured descriptor set (usually
                strings).
            label:
                Optional string label or JSON list of string labels defining
                specific classifiers to use for inferencing against the
                descriptors.

        Possible error codes:
            400
                No UIDs provided, the UIDs are not valid JSON or not a JSON
                list, a UID does not exist in the configured descriptor set,
                or provided labels are malformed.
            404
                Label or labels provided do not match any registered
                classifier

        Returns: {
            ...
            result: {
                classifier-label: {
                    class-label: prob,
                    ...
                },
                ...
            }
        }

        """
        if self.classifier_collection.size() == 0:
            return make_response_json("No classifiers currently loaded.", 200,
                                      result={})

        uid_list = flask.request.values.get('uid_list')
        if uid_list is None:
            return make_response_json("No UIDs provided.", 400)
        try:
            uid_list = json.loads(uid_list)
        except json.JSONDecodeError:
            return make_response_json("Failed to parse JSON list of UIDs.",
                                      400)
        # Valid JSON is not necessarily a list: a JSON string would otherwise
        # be iterated character by character and a number would fail further
        # down with an unhandled error.
        if not isinstance(uid_list, list):
            return make_response_json("UIDs must be provided as a JSON list.",
                                      400)
        if not uid_list:
            return make_response_json("No UIDs provided.", 400)

        try:
            # We could technically pass `labels_from_input` as the `type=`
            # value in `.get()` but the method eats ValueErrors raised.
            labels = labels_from_input(
                flask.request.values.get('label', default=None))
        except ValueError as ex:
            return make_response_json(f"Invalid label(s) specified: {ex}", 400)

        # Label list has been parsed at this point. Make sure its contents
        # meshes with available classifiers before retrieving descriptors.
        if labels is not None:
            missing_labels = (set(labels) -
                              set(self.classifier_collection.labels()))
            if missing_labels:
                return make_response_json(
                    "The following labels are not registered with any "
                    "classifiers: " + ", ".join(map(repr, missing_labels)),
                    404,
                    missing_labels=list(missing_labels))

        try:
            vec_list = self.descriptor_set.get_many_vectors(uid_list)
        except KeyError:
            return make_response_json("One or more input UIDs did not exist in"
                                      " the configured descriptor set!", 400)
        pred_map = self.classifier_collection.classify_arrays(
            vec_list, labels=labels
        )

        # TODO: Add `adjustment` functionality like from `POST /classify`.

        return make_response_json("", 200, result=pred_map)
예제 #10
0
    def add_classifier(self):
        """
        Upload a **trained** classifier pickled and encoded in standard base64
        encoding, matched with a descriptive label of that classifier's topic.

        Since all classifiers have only two result classes (positive and
        negative), the topic of the classifier is encoded in the descriptive
        label the user applies to the classifier.

        Below is an example call to this endpoint via the ``requests`` python
        module, showing how base64 data is sent::

            import base64
            import requests
            data_bytes = "Load some content bytes here."
            requests.post('http://localhost:5000/classifier',
                          data={'bytes_b64': base64.b64encode(data_bytes),
                                'label': 'some_label'})

        With curl on the command line::

            $ curl -X POST localhost:5000/classifier -d label=some_label \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file.pkl > /path/to/file.pkl.b64
            $ curl -X POST localhost:5000/classifier -d label=some_label \
                --data-urlencode bytes_b64@/path/to/file.pkl.b64

        To lock this classifier and guard it against deletion, add
        "lock_label=true"::

            $ curl -X POST localhost:5000/classifier \
                -d "label=some_label" \
                -d "lock_label=true" \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file.pkl)"

        Data/Form arguments:
            bytes_b64
                Bytes, in the standard base64 encoding, of the pickled
                classifier.
            label
                Descriptive label to apply to this classifier. This should not
                conflict with existing classifier labels.
            lock_label
                If 'true', disallow deletion of this label. If 'false', allow
                deletion of this label. Only has an effect if deletion is
                enabled for this service. (Default: 'false')

        Possible error codes:
            400
                May mean one of:
                    - No pickled classifier base64 data or label provided.
                    - 'lock_label' is not valid JSON.
                    - Label provided is in conflict with an existing label in
                      the classifier collection.
                    - The base64 data is invalid or cannot be unpickled.
                    - The unpickled data was rejected by the classifier
                      collection.

        Returns code 201 on success and the message: {
            label: <str>
        }

        """
        clfr_b64 = flask.request.values.get('bytes_b64', default=None)
        label = flask.request.values.get('label', default=None)
        lock_clfr_str = flask.request.values.get('lock_label', default='false')

        # Missing values and empty strings are both rejected.
        if not clfr_b64:
            return make_response_json("No state base64 data provided.", 400)
        elif not label:
            return make_response_json("No descriptive label provided.", 400)
        try:
            # Fails with the JSON decode exception if lock_label is not valid
            # JSON.
            lock_clfr = bool(flask.json.loads(lock_clfr_str))
        except JSON_DECODE_EXCEPTION:
            return make_response_json(
                "Invalid boolean value for"
                " 'lock_label'. Was given: '%s'" % lock_clfr_str, 400)

        # If the given label conflicts with one already in the collection,
        # fail.
        if label in self.classifier_collection.labels():
            return make_response_json("Label '%s' already exists in"
                                      " classifier collection." % label,
                                      400,
                                      label=label)

        # Malformed base64 is a client error, not a server failure.
        # ``binascii.Error`` (raised on Python 3) is a ``ValueError``
        # subclass; Python 2 raises ``TypeError`` instead.
        try:
            clfr_bytes = base64.b64decode(clfr_b64.encode('utf-8'))
        except (TypeError, ValueError) as ex:
            return make_response_json("Invalid base64 input: %s" % str(ex),
                                      400)

        # SECURITY NOTE(review): unpickling request data can execute arbitrary
        # code. This endpoint must only be reachable by trusted clients.
        try:
            clfr = pickle.loads(clfr_bytes)
        except (pickle.UnpicklingError, AttributeError, EOFError, ImportError,
                IndexError, TypeError, ValueError) as ex:
            # Unpickling corrupt or foreign data raises more than just
            # ``UnpicklingError``; treat all of these as bad client input.
            return make_response_json("Failed to unpickle classifier data "
                                      "for label '%s': %s" % (label, ex),
                                      400,
                                      label=label)

        try:
            self.classifier_collection.add_classifier(label, clfr)

            # If we're allowing deletions, get the lock flag from the form
            # and set it for this classifier
            if self.enable_classifier_removal and lock_clfr:
                self.immutable_labels.add(label)

        except ValueError:
            return make_response_json("Data added for label '%s' is not a"
                                      " Classifier." % label,
                                      400,
                                      label=label)

        return make_response_json("Uploaded classifier for label '%s'." %
                                  label,
                                  201,
                                  label=label)
예제 #11
0
    def add_iqr_state_classifier(self):
        """
        Train a classifier based on the user-provided IQR state file bytes in
        a base64 encoding, matched with a descriptive label of that
        classifier's topic.

        Since all IQR session classifiers end up only having two result
        classes (positive and negative), the topic of the classifier is
        encoded in the descriptive label the user applies to the classifier.

        Below is an example call to this endpoint via the ``requests`` python
        module, showing how base64 data is sent::

            import base64
            import requests
            data_bytes = "Load some content bytes here."
            requests.post('http://localhost:5000/iqr_classifier',
                          data={'bytes_b64': base64.b64encode(data_bytes),
                                'label': 'some_label'})

        With curl on the command line::

            $ curl -X POST localhost:5000/iqr_classifier \
                -d "label=some_label" \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file > /path/to/file.b64
            $ curl -X POST localhost:5000/iqr_classifier -d label=some_label \
                --data-urlencode bytes_b64@/path/to/file.b64

        To lock this classifier and guard it against deletion, add
        "lock_label=true"::

            $ curl -X POST localhost:5000/iqr_classifier \
                -d "label=some_label" \
                -d "lock_label=true" \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

        Form arguments:
            bytes_b64
                base64 encoding of the bytes of the IQR session state save
                file.
            label
                Descriptive label to apply to this classifier. This should not
                conflict with existing classifier labels.
            lock_label
                If 'true', disallow deletion of this label. If 'false', allow
                deletion of this label. Only has an effect if deletion is
                enabled for this service. (Default: 'false')

        Possible error codes:
            400
                May mean one of:
                    - No base64 data or label provided.
                    - 'lock_label' is not valid JSON.
                    - The base64 data is invalid.
                    - Label provided is in conflict with an existing label in
                      the classifier collection.

        Returns 201.

        """
        data_b64 = flask.request.values.get('bytes_b64', default=None)
        label = flask.request.values.get('label', default=None)
        lock_clfr_str = flask.request.values.get('lock_label', default='false')

        # Missing values and empty strings are both rejected.
        if not data_b64:
            return make_response_json("No state base64 data provided.", 400)
        elif not label:
            return make_response_json("No descriptive label provided.", 400)
        try:
            lock_clfr = bool(flask.json.loads(lock_clfr_str))
        except JSON_DECODE_EXCEPTION:
            return make_response_json(
                "Invalid boolean value for"
                " 'lock_label'. Was given: '%s'" % lock_clfr_str, 400)
        try:
            # Using urlsafe version because it handles both regular and urlsafe
            # alphabets.
            data_bytes = base64.urlsafe_b64decode(data_b64.encode('utf-8'))
        except (TypeError, binascii.Error) as ex:
            # Pass the status code to the response helper, as every other
            # error path does, instead of wrapping its result in another
            # ``(response, code)`` tuple.
            return make_response_json("Invalid base64 input: %s" % str(ex),
                                      400)

        # If the given label conflicts with one already in the collection,
        # fail.
        if label in self.classifier_collection.labels():
            return make_response_json(
                "Label already exists in classifier collection.", 400)

        # Create dummy IqrSession to extract pos/neg descriptors.
        iqrs = IqrSession()
        iqrs.set_state_bytes(data_bytes, self.descriptor_factory)
        pos = iqrs.positive_descriptors | iqrs.external_positive_descriptors
        neg = iqrs.negative_descriptors | iqrs.external_negative_descriptors
        del iqrs

        # Make a classifier instance from the stored config for IQR
        # session-based classifiers.
        #: :type: SupervisedClassifier
        classifier = from_config_dict(self.iqr_state_classifier_config,
                                      SupervisedClassifier.get_impls())
        classifier.train(class_examples={'positive': pos, 'negative': neg})

        try:
            self.classifier_collection.add_classifier(label, classifier)

            # If we're allowing deletions, get the lock flag from the form and
            # set it for this classifier
            if self.enable_classifier_removal and lock_clfr:
                self.immutable_labels.add(label)

        except ValueError as e:
            # A ValueError whose message mentions JSON is reported as a
            # malformed-JSON problem; any other ValueError is reported as a
            # duplicate label.
            if e.args[0].find('JSON') > -1:
                return make_response_json(
                    "Tried to parse malformed JSON in "
                    "form argument.", 400)
            return make_response_json("Duplicate label ('%s') added during "
                                      "classifier training of provided IQR "
                                      "session state." % label,
                                      400,
                                      label=label)

        return make_response_json("Finished training IQR-session-based "
                                  "classifier for label '%s'." % label,
                                  201,
                                  label=label)
예제 #12
0
    def classify(self):
        """
        Given a file's bytes (standard base64-format) and content mimetype,
        describe and classify the content against all currently stored
        classifiers (optionally a list of requested classifiers), returning a
        map of classifier descriptive labels to their class-to-probability
        results.

        We expect the data to be transmitted in the body of the request in
        standard base64 encoding form ("bytes_b64" key). We look for the
        content type either as URL parameter or within the body
        ("content_type" key).

        Below is an example call to this endpoint via the ``requests`` python
        module, showing how base64 data is sent::

            import base64
            import requests
            data_bytes = "Load some content bytes here."
            requests.post('http://localhost:5000/classify',
                          data={'bytes_b64': base64.b64encode(data_bytes),
                                'content_type': 'text/plain'})

        With curl on the command line::

            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file > /path/to/file.b64
            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                --data-urlencode bytes_b64@/path/to/file.b64

        Optionally, the `label` parameter can be provided to limit the results
        of classification to a set of classifiers::

            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                -d 'label=["some_label","other_label"]' \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file > /path/to/file.b64
            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                -d 'label=["some_label","other_label"]' \
                --data-urlencode bytes_b64@/path/to/file.b64

        Data/Form arguments:
            bytes_b64
                Bytes in the standard base64 encoding to be described and
                classified.
            content_type
                The mimetype of the sent data.
            label
                (Optional) JSON-encoded label or list of labels
            adjustment
                (Optional) JSON-encoded dictionary of labels to floats. Higher
                values lower the gain on the class and therefore correspond to
                higher precision (and lower recall) for the class (and higher
                recall/lower precision for other classes). This translates
                to calling ``smqtk.utils.probability.adjust_proba``.

        Possible error codes:
            400
                No bytes or content type provided, the bytes are not valid
                base64, or provided labels or adjustments are malformed
            404
                Label or labels provided do not match any registered
                classifier

        Returns: {
            ...
            result: {
                classifier-label: {
                    class-label: prob,
                    ...
                },
                ...
            }
        }

        """
        data_b64 = flask.request.values.get('bytes_b64', default=None)
        content_type = flask.request.values.get('content_type', default=None)
        label_str = flask.request.values.get('label', default=None)
        adjustment_str = flask.request.values.get('adjustment', default=None)

        labels = None
        if label_str is not None:
            try:
                labels = flask.json.loads(label_str)

                if isinstance(labels, six.string_types):
                    labels = [labels]
                elif isinstance(labels, list):
                    for el in labels:
                        if not isinstance(el, six.string_types):
                            return make_response_json(
                                "Label must be a list of strings or a"
                                " single string.", 400)
                else:
                    return make_response_json(
                        "Label must be a list of strings or a single"
                        " string.", 400)

            except JSON_DECODE_EXCEPTION:
                # Unquoted strings aren't valid JSON. That is, a plain string
                # needs to be passed as '"label"' rather than just 'label' or
                # "label". However, we can be a bit more generous and just
                # allow such a string, but we have to place *some* restriction
                # on it. We use `urllib.parse.quote` for this: a string is
                # accepted only when quoting leaves it unchanged, i.e. when it
                # consists solely of characters that are never percent-encoded
                # (ASCII letters, digits, underscores, periods and dashes;
                # newer Python versions also leave '~' alone). Note that
                # spaces ARE percent-encoded, so labels containing spaces
                # must be sent as proper JSON strings.

                # If label_str matches the url-encoded version of itself, go
                # ahead and use it
                if urllib.parse.quote(label_str, safe='') == label_str:
                    labels = [label_str]
                else:
                    return make_response_json(
                        "Label(s) are not properly formatted JSON.", 400)

        # Collect optional result probability adjustment values
        #: :type: dict[collections.Hashable, float]
        adjustments = {}
        if adjustment_str is not None:
            try:
                #: :type: dict[collections.Hashable, float]
                adjustments = flask.json.loads(adjustment_str)
            except JSON_DECODE_EXCEPTION:
                return make_response_json(
                    "Adjustment(s) are not properly formatted JSON.", 400)

            # Valid JSON that is not an object (list, number, null, ...)
            # cannot be iterated as label/value pairs; reject it up front
            # instead of failing with an unhandled error.
            if not isinstance(adjustments, dict):
                return make_response_json(
                    "Adjustment(s) must be a JSON object mapping labels to "
                    "numbers.", 400)

            for label, val in six.iteritems(adjustments):
                if not isinstance(label, six.string_types):
                    return make_response_json(
                        "Adjustment label '%s' is not a string type." %
                        label, 400)
                if not isinstance(val, (int, float)):
                    return make_response_json(
                        "Adjustment value %s for label '%s' is not an int "
                        "or float" % (val, label), 400)

        if data_b64 is None:
            return make_response_json("No base-64 bytes provided.", 400)
        elif content_type is None:
            return make_response_json("No content type provided.", 400)

        # Malformed base64 is a client error. ``binascii.Error`` (raised on
        # Python 3) is a ``ValueError`` subclass; Python 2 raises
        # ``TypeError`` instead.
        try:
            data_bytes = base64.b64decode(data_b64.encode('utf-8'))
        except (TypeError, ValueError) as ex:
            return make_response_json("Invalid base64 input: %s" % str(ex),
                                      400)
        self._log.debug("Length of byte data: %d", len(data_bytes))

        data_elem = DataMemoryElement(data_bytes, content_type, readonly=True)
        descr_elem = self.descriptor_gen.generate_one_element(
            data_elem, descr_factory=self.descriptor_factory)
        self._log.debug("Descriptor shape: %s", descr_elem.vector().shape)

        try:
            clfr_map = self.classifier_collection.classify(
                descr_elem, labels=labels, factory=self.classification_factory)
        except MissingLabelError as ex:
            return make_response_json(
                "The following labels are not registered with any"
                " classifiers: '%s'" % "', '".join(ex.labels),
                404,
                missing_labels=list(ex.labels))

        # Transform classification result into JSON
        c_json = {}
        for classifier_label, c_elem in six.iteritems(clfr_map):
            prediction = c_elem.get_classification()
            if adjustments:
                proba_labels = list(prediction.keys())
                proba = [prediction[k] for k in proba_labels]
                # Use opposite of adjustments, because we already set the
                # convention of "higher: precision, lower: recall"
                adj = [-adjustments.get(label, 0.0) for label in proba_labels]
                adj_proba = probability.adjust_proba(proba, adj)
                prediction = dict(zip(proba_labels, adj_proba[0]))
            c_json[classifier_label] = prediction

        return make_response_json('Finished classification.', result=c_json)
예제 #13
0
 def is_ready(self):
     """
     Liveness check endpoint.

     Always answers with a fixed JSON message; receiving any response at all
     indicates that the server is up and handling requests.
     """
     alive_message = "Yes, I'm alive!"
     return make_response_json(alive_message)
예제 #14
0
    def classify(self):
        r"""
        Given a file's bytes (standard base64-format) and content mimetype,
        describe and classify the content against all currently stored
        classifiers (optionally a list of requested classifiers), returning a
        map of classifier descriptive labels to their class-to-probability
        results.

        We expect the data to be transmitted in the body of the request in
        standard base64 encoding form ("bytes_b64" key). We look for the
        content type either as URL parameter or within the body
        ("content_type" key).

        Below is an example call to this endpoint via the ``requests`` python
        module, showing how base64 data is sent::

            import base64
            import requests
            data_bytes = b"Load some content bytes here."
            requests.post('http://localhost:5000/classify',
                          data={'bytes_b64': base64.b64encode(data_bytes),
                                'content_type': 'text/plain'})

        With curl on the command line::

            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file > /path/to/file.b64
            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                --data-urlencode bytes_b64@/path/to/file.b64

        Optionally, the `label` parameter can be provided to limit the results
        of classification to a set of classifiers::

            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                -d 'label=["some_label","other_label"]' \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file > /path/to/file.b64
            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                -d 'label=["some_label","other_label"]' \
                --data-urlencode bytes_b64@/path/to/file.b64

        Data/Form arguments:
            bytes_b64
                Bytes in the standard base64 encoding to be described and
                classified.
            content_type
                The mimetype of the sent data.
            label
                (Optional) JSON-encoded label or list of labels
            adjustment
                (Optional) JSON-encoded dictionary of labels to floats. Higher
                values lower the gain on the class and therefore correspond to
                higher precision (and lower recall) for the class (and higher
                recall/lower precision for other classes). This translates
                to calling ``smqtk.utils.probability.adjust_proba``.

        Possible error codes:
            400
                May mean one of:
                    - No bytes or no content type provided.
                    - Provided bytes are not valid base64.
                    - Provided labels are malformed.
                    - Provided adjustments are not valid JSON, are not a JSON
                      object, or contain a non-numeric value.
            404
                Label or labels provided do not match any registered
                classifier

        Returns: {
            ...
            result: {
                classifier-label: {
                    class-label: prob,
                    ...
                },
                ...
            }
        }

        """
        data_b64 = flask.request.values.get('bytes_b64', default=None)
        content_type = flask.request.values.get('content_type', default=None)
        label_str = flask.request.values.get('label', default=None)
        adjustment_str = flask.request.values.get('adjustment', default=None)

        try:
            labels = labels_from_input(label_str)
        except ValueError as ex:
            return make_response_json(f"Invalid label(s) specified: {ex}", 400)

        # Collect optional result probability adjustment values
        #: :type: dict[collections.abc.Hashable, float]
        adjustments = {}
        if adjustment_str is not None:
            try:
                adjustments = json.loads(adjustment_str)
            except json.JSONDecodeError:
                return make_response_json(
                    "Adjustment(s) are not properly formatted JSON.", 400)

            # Valid JSON is not necessarily an object (e.g. ``[1]``, ``3`` or
            # ``null``); reject anything else here instead of failing with an
            # AttributeError when iterating its items below.
            if not isinstance(adjustments, dict):
                return make_response_json(
                    "Adjustment(s) must be a JSON object mapping labels to "
                    "numeric values.", 400)

            for label, val in six.iteritems(adjustments):
                if not isinstance(label, six.string_types):
                    return make_response_json(
                        "Adjustment label '%s' is not a string type."
                        % label,
                        400)
                if not isinstance(val, (int, float)):
                    return make_response_json(
                        "Adjustment value %s for label '%s' is not an int "
                        "or float" % (val, label),
                        400)

        if data_b64 is None:
            return make_response_json("No base-64 bytes provided.", 400)
        elif content_type is None:
            return make_response_json("No content type provided.", 400)

        try:
            data_bytes = base64.b64decode(data_b64.encode('utf-8'))
        except (TypeError, ValueError) as ex:
            # ``binascii.Error`` (raised on bad padding) is a ValueError
            # subclass; malformed client input is a 400, not a server error.
            return make_response_json("Invalid base64 input: %s" % str(ex),
                                      400)
        self._log.debug("Length of byte data: %d", len(data_bytes))

        data_elem = DataMemoryElement(data_bytes, content_type, readonly=True)
        descr_elem = self.descriptor_gen.generate_one_element(
            data_elem, descr_factory=self.descriptor_factory
        )
        self._log.debug("Descriptor shape: %s", descr_elem.vector().shape)

        try:
            clfr_map = self.classifier_collection.classify(
                descr_elem, labels=labels,
                factory=self.classification_factory)
        except MissingLabelError as ex:
            return make_response_json(
                "The following labels are not registered with any"
                " classifiers: '%s'"
                % "', '".join(ex.labels),
                404,
                missing_labels=list(ex.labels))

        # Transform classification result into JSON
        c_json = {}
        for classifier_label, c_elem in six.iteritems(clfr_map):
            prediction = c_elem.get_classification()
            if adjustments:
                proba_labels = list(prediction.keys())
                proba = [prediction[k] for k in proba_labels]
                # Use opposite of adjustments, because we already set the
                # convention of "higher: precision, lower: recall"
                adj = [-adjustments.get(lbl, 0.0) for lbl in proba_labels]
                adj_proba = probability.adjust_proba(proba, adj)
                prediction = dict(zip(proba_labels, adj_proba[0]))
            c_json[classifier_label] = prediction

        return make_response_json('Finished classification.',
                                  result=c_json)
예제 #15
0
    def add_classifier(self):
        r"""
        Upload a **trained** classifier pickled and encoded in standard base64
        encoding, matched with a descriptive label of that classifier's topic.

        Since all classifiers have only two result classes (positive and
        negative), the topic of the classifier is encoded in the descriptive
        label the user applies to the classifier.

        Below is an example call to this endpoint via the ``requests`` python
        module, showing how base64 data is sent::

            import base64
            import requests
            data_bytes = b"Load some content bytes here."
            requests.post('http://localhost:5000/classifier',
                          data={'bytes_b64': base64.b64encode(data_bytes),
                                'label': 'some_label'})

        With curl on the command line::

            $ curl -X POST localhost:5000/classifier -d label=some_label \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file.pkl > /path/to/file.pkl.b64
            $ curl -X POST localhost:5000/classifier -d label=some_label \
                --data-urlencode bytes_b64@/path/to/file.pkl.b64

        To lock this classifier and guard it against deletion, add
        "lock_label=true"::

            $ curl -X POST localhost:5000/classifier \
                -d "label=some_label" \
                -d "lock_label=true" \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file.pkl)"

        Data/Form arguments:
            bytes_b64
                Bytes, in the standard base64 encoding, of the pickled
                classifier.
            label
                Descriptive label to apply to this classifier. This should not
                conflict with existing classifier labels.
            lock_label
                If 'true', disallow deletion of this label. If 'false', allow
                deletion of this label. Only has an effect if deletion is
                enabled for this service. (Default: 'false')

        Possible error codes:
            400
                May mean one of:
                    - No pickled classifier base64 data or label provided.
                    - The 'lock_label' value could not be parsed.
                    - The provided data is not valid base64.
                    - Label provided is in conflict with an existing label in
                      the classifier collection.
                    - The unpickled object was rejected by the classifier
                      collection.

        Returns code 201 on success and the message: {
            label: <str>
        }

        """
        clfr_b64 = flask.request.values.get('bytes_b64', default=None)
        label = flask.request.values.get('label', default=None)
        lock_clfr_str = flask.request.values.get('lock_label',
                                                 default='false')

        if not clfr_b64:
            return make_response_json("No state base64 data provided.", 400)
        elif not label:
            return make_response_json("No descriptive label provided.", 400)
        try:
            # This can throw a ValueError if lock_clfr is malformed JSON
            lock_clfr = bool(flask.json.loads(lock_clfr_str))
        except JSON_DECODE_EXCEPTION:
            return make_response_json("Invalid boolean value for"
                                      " 'lock_label'. Was given: '%s'"
                                      % lock_clfr_str,
                                      400)

        # If the given label conflicts with one already in the collection,
        # fail.
        if label in self.classifier_collection.labels():
            return make_response_json("Label '%s' already exists in"
                                      " classifier collection." % label,
                                      400,
                                      label=label)

        try:
            clfr_bytes = base64.b64decode(clfr_b64.encode('utf-8'))
        except (TypeError, ValueError) as ex:
            # ``binascii.Error`` (raised on bad padding) is a ValueError
            # subclass; malformed client input is a 400, not a server error.
            return make_response_json("Invalid base64 input: %s" % str(ex),
                                      400)

        # SECURITY NOTE(review): unpickling request-supplied bytes can execute
        # arbitrary code. This endpoint must only be reachable by trusted
        # clients; errors raised while unpickling are not handled here.
        clfr = pickle.loads(clfr_bytes)

        try:
            self.classifier_collection.add_classifier(label, clfr)

            # If we're allowing deletions, get the lock flag from the form
            # and set it for this classifier
            if self.enable_classifier_removal and lock_clfr:
                self.immutable_labels.add(label)

        except ValueError:
            return make_response_json("Data added for label '%s' is not a"
                                      " Classifier." % label,
                                      400,
                                      label=label)

        return make_response_json("Uploaded classifier for label '%s'."
                                  % label,
                                  201,
                                  label=label)
예제 #16
0
    def add_iqr_state_classifier(self):
        r"""
        Train a classifier based on the user-provided IQR state file bytes in
        a base64 encoding, matched with a descriptive label of that
        classifier's topic.

        Since all IQR session classifiers end up only having two result
        classes (positive and negative), the topic of the classifier is
        encoded in the descriptive label the user applies to the classifier.

        Below is an example call to this endpoint via the ``requests`` python
        module, showing how base64 data is sent::

            import base64
            import requests
            data_bytes = b"Load some content bytes here."
            requests.post('http://localhost:5000/iqr_classifier',
                          data={'bytes_b64': base64.b64encode(data_bytes),
                                'label': 'some_label'})

        With curl on the command line::

            $ curl -X POST localhost:5000/iqr_classifier \
                -d "label=some_label" \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file > /path/to/file.b64
            $ curl -X POST localhost:5000/iqr_classifier -d label=some_label \
                --data-urlencode bytes_b64@/path/to/file.b64

        To lock this classifier and guard it against deletion, add
        "lock_label=true"::

            $ curl -X POST localhost:5000/iqr_classifier \
                -d "label=some_label" \
                -d "lock_label=true" \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

        Form arguments:
            bytes_b64
                base64 encoding of the bytes of the IQR session state save
                file.
            label
                Descriptive label to apply to this classifier. This should not
                conflict with existing classifier labels.
            lock_label
                If 'true', disallow deletion of this label. If 'false', allow
                deletion of this label. Only has an effect if deletion is
                enabled for this service. (Default: 'false')

        Possible error codes:
            400
                May mean one of:
                    - No base64 state data or label provided.
                    - The 'lock_label' value could not be parsed.
                    - The provided data is not valid base64.
                    - Label provided is in conflict with an existing label in
                      the classifier collection.

        Returns 201.

        """
        data_b64 = flask.request.values.get('bytes_b64', default=None)
        label = flask.request.values.get('label', default=None)
        lock_clfr_str = flask.request.values.get('lock_label',
                                                 default='false')

        if not data_b64:
            return make_response_json("No state base64 data provided.", 400)
        elif not label:
            return make_response_json("No descriptive label provided.", 400)
        try:
            lock_clfr = bool(flask.json.loads(lock_clfr_str))
        except JSON_DECODE_EXCEPTION:
            return make_response_json("Invalid boolean value for"
                                      " 'lock_label'. Was given: '%s'"
                                      % lock_clfr_str,
                                      400)
        try:
            # Using urlsafe version because it handles both regular and urlsafe
            # alphabets.
            data_bytes = base64.urlsafe_b64decode(data_b64.encode('utf-8'))
        except (TypeError, binascii.Error) as ex:
            # The status code is passed to ``make_response_json`` like every
            # other error path, rather than appended outside the call.
            return make_response_json("Invalid base64 input: %s" % str(ex),
                                      400)

        # If the given label conflicts with one already in the collection,
        # fail.
        if label in self.classifier_collection.labels():
            return make_response_json(
                "Label already exists in classifier collection.", 400)

        # Create dummy IqrSession to extract pos/neg descriptors.
        iqrs = IqrSession()
        iqrs.set_state_bytes(data_bytes, self.descriptor_factory)
        pos = iqrs.positive_descriptors | iqrs.external_positive_descriptors
        neg = iqrs.negative_descriptors | iqrs.external_negative_descriptors
        del iqrs

        # Make a classifier instance from the stored config for IQR
        # session-based classifiers.
        #: :type: SupervisedClassifier
        classifier = smqtk.utils.plugin.from_plugin_config(
            self.iqr_state_classifier_config,
            get_classifier_impls(sub_interface=SupervisedClassifier)
        )
        classifier.train(class_examples={'positive': pos, 'negative': neg})

        try:
            self.classifier_collection.add_classifier(label, classifier)

            # If we're allowing deletions, get the lock flag from the form and
            # set it for this classifier
            if self.enable_classifier_removal and lock_clfr:
                self.immutable_labels.add(label)

        except ValueError as e:
            # Inspect the stringified exception so that an error carrying no
            # args (or a non-string first arg) cannot itself raise here.
            if 'JSON' in str(e):
                return make_response_json("Tried to parse malformed JSON in "
                                          "form argument.", 400)
            return make_response_json("Duplicate label ('%s') added during "
                                      "classifier training of provided IQR "
                                      "session state." % label, 400,
                                      label=label)

        return make_response_json("Finished training IQR-session-based "
                                  "classifier for label '%s'." % label,
                                  201,
                                  label=label)
예제 #17
0
    def classify(self):
        r"""
        Given a file's bytes (standard base64-format) and content mimetype,
        describe and classify the content against all currently stored
        classifiers (optionally a list of requested classifiers), returning a
        map of classifier descriptive labels to their class-to-probability
        results.

        We expect the data to be transmitted in the body of the request in
        standard base64 encoding form ("bytes_b64" key). We look for the
        content type either as URL parameter or within the body
        ("content_type" key).

        Below is an example call to this endpoint via the ``requests`` python
        module, showing how base64 data is sent::

            import base64
            import requests
            data_bytes = b"Load some content bytes here."
            requests.post('http://localhost:5000/classify',
                          data={'bytes_b64': base64.b64encode(data_bytes),
                                'content_type': 'text/plain'})

        With curl on the command line::

            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file > /path/to/file.b64
            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                --data-urlencode bytes_b64@/path/to/file.b64

        Optionally, the `label` parameter can be provided to limit the results
        of classification to a set of classifiers::

            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                -d 'label=["some_label","other_label"]' \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file > /path/to/file.b64
            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                -d 'label=["some_label","other_label"]' \
                --data-urlencode bytes_b64@/path/to/file.b64

        Data/Form arguments:
            bytes_b64
                Bytes in the standard base64 encoding to be described and
                classified.
            content_type
                The mimetype of the sent data.
            label
                (Optional) JSON-encoded label or list of labels
            adjustment
                (Optional) JSON-encoded dictionary of labels to floats. Higher
                values lower the gain on the class and therefore correspond to
                higher precision (and lower recall) for the class (and higher
                recall/lower precision for other classes). This translates to
                calling ``smqtk.utils.prob_utils.adjust_proba``.

        Possible error codes:
            400
                May mean one of:
                    - No bytes or no content type provided.
                    - Provided bytes are not valid base64.
                    - Provided labels are malformed.
                    - Provided adjustments are not valid JSON, are not a JSON
                      object, or contain a non-numeric value.
            404
                Label or labels provided do not match any registered
                classifier

        Returns: {
            ...
            result: {
                classifier-label: {
                    class-label: prob,
                    ...
                },
                ...
            }
        }

        """
        data_b64 = flask.request.values.get('bytes_b64', default=None)
        content_type = flask.request.values.get('content_type', default=None)
        label_str = flask.request.values.get('label', default=None)
        adjustment_str = flask.request.values.get('adjustment', default=None)

        labels = None
        if label_str is not None:
            try:
                labels = flask.json.loads(label_str)

                if isinstance(labels, six.string_types):
                    labels = [labels]
                elif isinstance(labels, list):
                    for el in labels:
                        if not isinstance(el, six.string_types):
                            return make_response_json(
                                "Label must be a list of strings or a"
                                " single string.", 400)
                else:
                    return make_response_json(
                        "Label must be a list of strings or a single"
                        " string.", 400)

            except JSON_DECODE_EXCEPTION:
                # Unquoted strings aren't valid JSON. That is, a plain string
                # needs to be passed as '"label"' rather than just 'label' or
                # "label". However, we can be a bit more generous and just
                # allow such a string, but we have to place *some* restriction
                # on it. We use `urllib.quote` for this since essentially it
                # just checks to make sure that the string is made up of one
                # of the following types of characters:
                #
                #   - letters
                #   - numbers
                #   - spaces, underscores, periods, and dashes
                #
                # Since the concept of a "letter" is fraught with encoding and
                # locality issues, we simply let urllib make this decision for
                # us.

                # If label_str matches the url-encoded version of itself, go
                # ahead and use it
                if urllib.parse.quote(label_str, safe='') == label_str:
                    labels = [label_str]
                else:
                    return make_response_json(
                        "Label(s) are not properly formatted JSON.", 400)

        # Collect optional result probability adjustment values
        #: :type: dict[collections.Hashable, float]
        adjustments = {}
        if adjustment_str is not None:
            try:
                adjustments = flask.json.loads(adjustment_str)
            except JSON_DECODE_EXCEPTION:
                return make_response_json(
                    "Adjustment(s) are not properly formatted JSON.", 400)

            # Valid JSON is not necessarily an object (e.g. ``[1]``, ``3`` or
            # ``null``); reject anything else here instead of failing with an
            # AttributeError when iterating its items below.
            if not isinstance(adjustments, dict):
                return make_response_json(
                    "Adjustment(s) must be a JSON object mapping labels to "
                    "numeric values.", 400)

            for label, val in six.iteritems(adjustments):
                if not isinstance(label, six.string_types):
                    return make_response_json(
                        "Adjustment label '%s' is not a string type."
                        % label,
                        400)
                if not isinstance(val, (int, float)):
                    return make_response_json(
                        "Adjustment value %s for label '%s' is not an int "
                        "or float" % (val, label),
                        400)

        if data_b64 is None:
            return make_response_json("No base-64 bytes provided.", 400)
        elif content_type is None:
            return make_response_json("No content type provided.", 400)

        try:
            data_bytes = base64.b64decode(data_b64.encode('utf-8'))
        except (TypeError, ValueError) as ex:
            # Python 3 raises ``binascii.Error`` (a ValueError subclass) on
            # bad padding, Python 2 raises TypeError; malformed client input
            # is a 400, not a server error.
            return make_response_json("Invalid base64 input: %s" % str(ex),
                                      400)
        self._log.debug("Length of byte data: %d", len(data_bytes))

        data_elem = DataMemoryElement(data_bytes, content_type, readonly=True)
        descr_elem = self.descriptor_gen.compute_descriptor(
            data_elem, self.descriptor_factory
        )
        self._log.debug("Descriptor shape: %s", descr_elem.vector().shape)

        try:
            clfr_map = self.classifier_collection.classify(
                descr_elem, labels=labels,
                factory=self.classification_factory)
        except MissingLabelError as ex:
            return make_response_json(
                "The following labels are not registered with any"
                " classifiers: '%s'"
                % "', '".join(ex.labels),
                404,
                missing_labels=list(ex.labels))

        # Transform classification result into JSON
        c_json = {}
        for classifier_label, c_elem in six.iteritems(clfr_map):
            prediction = c_elem.get_classification()
            if adjustments:
                proba_labels = list(prediction.keys())
                proba = [prediction[k] for k in proba_labels]
                # Use opposite of adjustments, because we already set the
                # convention of "higher: precision, lower: recall"
                adj = [-adjustments.get(lbl, 0.0) for lbl in proba_labels]
                adj_proba = prob_utils.adjust_proba(proba, adj)
                prediction = dict(zip(proba_labels, adj_proba[0]))
            c_json[classifier_label] = prediction

        return make_response_json('Finished classification.',
                                  result=c_json)
예제 #18
0
 def is_ready(self):
     """
     Heartbeat endpoint: a response of any kind means the service is up.

     Takes no request arguments and always returns the same JSON message.
     """
     reply = make_response_json("Yes, I'm alive!")
     return reply