def test_adjust_constant(self):
    """
    Check that applying the same adjustment value to every class returns
    probabilities that are ``np.allclose`` to the row-normalized input.
    """
    n_rows, n_classes = 10, 3
    proba = np.random.rand(n_rows, n_classes)
    # Normalize each row so that it sums to one.
    proba /= proba.sum(axis=1, keepdims=True)

    for uniform_adj in ([1, 1, 1], [10, 10, 10]):
        adjusted = probability.adjust_proba(proba, uniform_adj)
        self.assertTrue(np.allclose(proba, adjusted))
def test_single_class(self):
    """
    Check that with a single class column the adjusted output is
    ``np.allclose`` to 1, for random inputs with a positive and a negative
    adjustment, and for an all-ones input.
    """
    n_rows, n_classes = 10, 1

    # Random (un-normalized) single-column input, positive adjustment.
    random_proba = np.random.rand(n_rows, n_classes)
    adjusted = probability.adjust_proba(random_proba, [1])
    self.assertTrue(np.allclose(1, adjusted))

    # Fresh random input, negative adjustment.
    random_proba = np.random.rand(n_rows, n_classes)
    adjusted = probability.adjust_proba(random_proba, [-1])
    self.assertTrue(np.allclose(1, adjusted))

    # Input that is already all ones.
    unit_proba = np.ones_like(random_proba)
    adjusted = probability.adjust_proba(unit_proba, [1])
    self.assertTrue(np.allclose(1, adjusted))
def test_adjust_serial_vs_sum(self):
    """
    Check that applying two adjustments one after the other matches
    applying their sum once, and that applying an adjustment followed by
    its negation gives back the original probabilities (both compared
    with ``np.allclose``).
    """
    n_rows, n_classes = 10, 3
    proba = np.random.rand(n_rows, n_classes)
    # Normalize each row so that it sums to one.
    proba /= proba.sum(axis=1, keepdims=True)

    first_adj = np.array([1, 2, 3])
    second_adj = np.array([2, 0, -2])

    # Serial application versus a single application of the summed vector.
    after_first = probability.adjust_proba(proba, first_adj)
    after_both = probability.adjust_proba(after_first, second_adj)
    after_sum = probability.adjust_proba(proba, first_adj + second_adj)
    self.assertTrue(np.allclose(after_both, after_sum))

    # An adjustment followed by its negation.
    after_first = probability.adjust_proba(proba, first_adj)
    round_trip = probability.adjust_proba(after_first, -first_adj)
    self.assertTrue(np.allclose(round_trip, proba))
def test_adjust(self):
    """
    Check the direction in which each class column moves for three
    adjustment vectors: raising one class, lowering one class, and
    raising one class while lowering another by the same amount.
    """
    n_rows, n_classes = 10, 3
    proba = np.random.rand(n_rows, n_classes)
    # Normalize each row so that it sums to one.
    proba /= proba.sum(axis=1, keepdims=True)

    # Positive adjustment on the middle class only: the middle column is
    # expected to rise, the other two to fall, and nothing to stay close.
    adjusted = probability.adjust_proba(proba, [0, 1, 0])
    went_up = adjusted > proba
    self.assertTrue(np.all(went_up == [False, True, False]))
    went_down = adjusted < proba
    self.assertTrue(np.all(went_down == [True, False, True]))
    unchanged = np.isclose(proba, adjusted)
    self.assertFalse(np.any(unchanged))

    # Negative adjustment on the first class only: the first column is
    # expected to fall, the other two to rise, and nothing to stay close.
    adjusted = probability.adjust_proba(proba, [-1, 0, 0])
    went_down = adjusted < proba
    self.assertTrue(np.all(went_down == [True, False, False]))
    went_up = adjusted > proba
    self.assertTrue(np.all(went_up == [False, True, True]))
    unchanged = np.isclose(proba, adjusted)
    self.assertFalse(np.any(unchanged))

    # Opposite adjustments on the outer classes: only the first and last
    # columns are checked for direction.
    outer_columns = [0, 2]
    adjusted = probability.adjust_proba(proba, [1.5, 0, -1.5])
    went_down = adjusted < proba
    self.assertTrue(np.all(went_down[:, outer_columns] == [False, True]))
    went_up = adjusted > proba
    self.assertTrue(np.all(went_up[:, outer_columns] == [True, False]))
    unchanged = np.isclose(proba, adjusted)
    # NOTE(review): this only asserts that the closeness mask is not
    # exactly [False, True, False] on every row, which looks weaker than
    # the checks in the sections above -- confirm the intent.
    self.assertFalse(np.all(unchanged == [False, True, False]))
def test_shape_cases(self):
    """
    Check that mismatched probability/adjustment shapes raise a
    ``ValueError`` with the expected message, and that a single-row
    probability matrix is accepted.
    """
    n_rows, n_classes = 10, 3
    shape_error = ("The dimensions of probabilities and "
                   "adjustments must be compatible.")

    # Three class columns against a two-element adjustment vector.
    proba = np.random.rand(n_rows, n_classes)
    with self.assertRaisesRegex(ValueError, shape_error):
        probability.adjust_proba(proba, [1, 2])

    # A single normalized row with a uniform adjustment.
    proba = np.random.rand(1, n_classes)
    proba /= proba.sum()
    adjusted = probability.adjust_proba(proba, [1, 1, 1])
    self.assertTrue(np.allclose(proba, adjusted))

    # Column-shaped probabilities against a row-shaped adjustment.
    with self.assertRaisesRegex(ValueError, shape_error):
        probability.adjust_proba(np.ones((n_rows, 1)),
                                 np.ones((1, n_rows)))
def classify(self):
    r"""
    Given a file's bytes (standard base64-format) and content mimetype,
    describe and classify the content against all currently stored
    classifiers (optionally a list of requested classifiers), returning a
    map of classifier descriptive labels to their class-to-probability
    results.

    We expect the data to be transmitted in the body of the request in
    standard base64 encoding form ("bytes_b64" key). We look for the
    content type either as URL parameter or within the body
    ("content_type" key).

    Below is an example call to this endpoint via the ``requests`` python
    module, showing how base64 data is sent::

        import base64
        import requests
        data_bytes = b"Load some content bytes here."
        requests.post('http://localhost:5000/classify',
                      data={'bytes_b64': base64.b64encode(data_bytes),
                            'content_type': 'text/plain'})

    With curl on the command line::

        $ curl -X POST localhost:5000/classify \
            -d "content_type=text/plain" \
            --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

        # If this fails, you may wish to encode the file separately and
        # use the file reference syntax instead:

        $ base64 -w0 /path/to/file > /path/to/file.b64
        $ curl -X POST localhost:5000/classify \
            -d "content_type=text/plain" \
            --data-urlencode bytes_b64@/path/to/file.b64

    Optionally, the `label` parameter can be provided to limit the results
    of classification to a set of classifiers::

        $ curl -X POST localhost:5000/classify \
            -d "content_type=text/plain" \
            -d 'label=["some_label","other_label"]' \
            --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

        # If this fails, you may wish to encode the file separately and
        # use the file reference syntax instead:

        $ base64 -w0 /path/to/file > /path/to/file.b64
        $ curl -X POST localhost:5000/classify \
            -d "content_type=text/plain" \
            -d 'label=["some_label","other_label"]' \
            --data-urlencode bytes_b64@/path/to/file.b64

    Data/Form arguments:
        bytes_b64
            Bytes in the standard base64 encoding to be described and
            classified.
        content_type
            The mimetype of the sent data.
        label
            (Optional) JSON-encoded label or list of labels
        adjustment
            (Optional) JSON-encoded dictionary of labels to floats. Higher
            values lower the gain on the class and therefore correspond to
            higher precision (and lower recall) for the class (and higher
            recall/lower precision for other classes). This translates
            to calling ``smqtk.utils.probability.adjust_proba``.

    Possible error codes:
        400
            No bytes or content type provided, the provided bytes are not
            valid base64, or the provided labels or adjustments are
            malformed
        404
            Label or labels provided do not match any registered
            classifier

    Returns: {
        ...
        result: {
            classifier-label: {
                class-label: prob,
                ...
            },
            ...
        }
    }

    """
    data_b64 = flask.request.values.get('bytes_b64', default=None)
    content_type = flask.request.values.get('content_type', default=None)
    label_str = flask.request.values.get('label', default=None)
    adjustment_str = flask.request.values.get('adjustment', default=None)

    labels = None
    if label_str is not None:
        try:
            labels = flask.json.loads(label_str)

            if isinstance(labels, six.string_types):
                labels = [labels]
            elif isinstance(labels, list):
                for el in labels:
                    if not isinstance(el, six.string_types):
                        return make_response_json(
                            "Label must be a list of strings or a"
                            " single string.", 400)
            else:
                return make_response_json(
                    "Label must be a list of strings or a single"
                    " string.", 400)

        except JSON_DECODE_EXCEPTION:
            # Unquoted strings aren't valid JSON. That is, a plain string
            # needs to be passed as '"label"' rather than just 'label' or
            # "label". However, we can be a bit more generous and just
            # allow such a string, but we have to place *some* restriction
            # on it. We use `urllib.parse.quote` for this: with
            # ``safe=''`` it leaves only the following characters
            # unescaped:
            #
            #   - letters
            #   - numbers
            #   - underscores, periods, and dashes (and, on Python 3.7+,
            #     tildes)
            #
            # Anything else, including spaces, is percent-encoded and
            # therefore rejected by the comparison below.
            #
            # Since the concept of a "letter" is fraught with encoding and
            # locality issues, we simply let urllib make this decision for
            # us.

            # If label_str matches the url-encoded version of itself, go
            # ahead and use it
            if urllib.parse.quote(label_str, safe='') == label_str:
                labels = [label_str]
            else:
                return make_response_json(
                    "Label(s) are not properly formatted JSON.", 400)

    # Collect optional result probability adjustment values
    #: :type: dict[collections.Hashable, float]
    adjustments = {}
    if adjustment_str is not None:
        try:
            #: :type: dict[collections.Hashable, float]
            adjustments = flask.json.loads(adjustment_str)
        except JSON_DECODE_EXCEPTION:
            return make_response_json(
                "Adjustment(s) are not properly formatted JSON.", 400)

        # Valid JSON is not necessarily a JSON object (e.g. ``null`` or a
        # list); reject those here instead of failing during iteration.
        if not isinstance(adjustments, dict):
            return make_response_json(
                "Adjustment(s) must be a JSON object mapping labels to "
                "numbers.", 400)

        for label, val in six.iteritems(adjustments):
            if not isinstance(label, six.string_types):
                return make_response_json(
                    "Adjustment label '%s' is not a string type."
                    % label,
                    400)
            if not isinstance(val, (int, float)):
                return make_response_json(
                    "Adjustment value %s for label '%s' is not an int "
                    "or float" % (val, label),
                    400)

    if data_b64 is None:
        return make_response_json("No base-64 bytes provided.", 400)
    elif content_type is None:
        return make_response_json("No content type provided.", 400)

    # Malformed base64 is a client error. Python 3 raises
    # ``binascii.Error`` (a ``ValueError`` subclass) and Python 2 raises
    # ``TypeError``, so both are reported as a 400.
    try:
        data_bytes = base64.b64decode(data_b64.encode('utf-8'))
    except (TypeError, ValueError):
        return make_response_json(
            "Provided bytes are not valid base-64.", 400)
    self._log.debug("Length of byte data: %d", len(data_bytes))

    data_elem = DataMemoryElement(data_bytes, content_type, readonly=True)
    descr_elem = self.descriptor_gen.generate_one_element(
        data_elem, descr_factory=self.descriptor_factory)
    self._log.debug("Descriptor shape: %s", descr_elem.vector().shape)

    try:
        clfr_map = self.classifier_collection.classify(
            descr_elem, labels=labels,
            factory=self.classification_factory)
    except MissingLabelError as ex:
        return make_response_json(
            "The following labels are not registered with any"
            " classifiers: '%s'"
            % "', '".join(ex.labels),
            404,
            missing_labels=list(ex.labels))

    # Transform classification result into JSON
    c_json = {}
    for classifier_label, c_elem in six.iteritems(clfr_map):
        prediction = c_elem.get_classification()
        if adjustments:
            # Fix an ordering of the class labels so the probability and
            # adjustment vectors line up element for element.
            proba_labels = list(prediction.keys())
            proba = [prediction[k] for k in proba_labels]
            # Use opposite of adjustments, because we already set the
            # convention of "higher: precision, lower: recall"
            adj = [-adjustments.get(label, 0.0) for label in proba_labels]
            adj_proba = probability.adjust_proba(proba, adj)
            # Only the first row of the adjusted result is used.
            prediction = dict(zip(proba_labels, adj_proba[0]))
        c_json[classifier_label] = prediction

    return make_response_json('Finished classification.',
                              result=c_json)
def classify(self):
    r"""
    Given a file's bytes (standard base64-format) and content mimetype,
    describe and classify the content against all currently stored
    classifiers (optionally a list of requested classifiers), returning a
    map of classifier descriptive labels to their class-to-probability
    results.

    We expect the data to be transmitted in the body of the request in
    standard base64 encoding form ("bytes_b64" key). We look for the
    content type either as URL parameter or within the body
    ("content_type" key).

    Below is an example call to this endpoint via the ``requests`` python
    module, showing how base64 data is sent::

        import base64
        import requests
        data_bytes = b"Load some content bytes here."
        requests.post('http://localhost:5000/classify',
                      data={'bytes_b64': base64.b64encode(data_bytes),
                            'content_type': 'text/plain'})

    With curl on the command line::

        $ curl -X POST localhost:5000/classify \
            -d "content_type=text/plain" \
            --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

        # If this fails, you may wish to encode the file separately and
        # use the file reference syntax instead:

        $ base64 -w0 /path/to/file > /path/to/file.b64
        $ curl -X POST localhost:5000/classify \
            -d "content_type=text/plain" \
            --data-urlencode bytes_b64@/path/to/file.b64

    Optionally, the `label` parameter can be provided to limit the results
    of classification to a set of classifiers::

        $ curl -X POST localhost:5000/classify \
            -d "content_type=text/plain" \
            -d 'label=["some_label","other_label"]' \
            --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

        # If this fails, you may wish to encode the file separately and
        # use the file reference syntax instead:

        $ base64 -w0 /path/to/file > /path/to/file.b64
        $ curl -X POST localhost:5000/classify \
            -d "content_type=text/plain" \
            -d 'label=["some_label","other_label"]' \
            --data-urlencode bytes_b64@/path/to/file.b64

    Data/Form arguments:
        bytes_b64
            Bytes in the standard base64 encoding to be described and
            classified.
        content_type
            The mimetype of the sent data.
        label
            (Optional) JSON-encoded label or list of labels
        adjustment
            (Optional) JSON-encoded dictionary of labels to floats. Higher
            values lower the gain on the class and therefore correspond to
            higher precision (and lower recall) for the class (and higher
            recall/lower precision for other classes). This translates
            to calling ``smqtk.utils.probability.adjust_proba``.

    Possible error codes:
        400
            No bytes or content type provided, the provided bytes are not
            valid base64, or the provided labels or adjustments are
            malformed
        404
            Label or labels provided do not match any registered
            classifier

    Returns: {
        ...
        result: {
            classifier-label: {
                class-label: prob,
                ...
            },
            ...
        }
    }

    """
    data_b64 = flask.request.values.get('bytes_b64', default=None)
    content_type = flask.request.values.get('content_type', default=None)
    label_str = flask.request.values.get('label', default=None)
    adjustment_str = flask.request.values.get('adjustment', default=None)

    try:
        labels = labels_from_input(label_str)
    except ValueError as ex:
        return make_response_json(f"Invalid label(s) specified: {ex}", 400)

    # Collect optional result probability adjustment values
    #: :type: dict[collections.abc.Hashable, float]
    adjustments = {}
    if adjustment_str is not None:
        try:
            #: :type: dict[collections.abc.Hashable, float]
            adjustments = json.loads(adjustment_str)
        except json.JSONDecodeError:
            return make_response_json(
                "Adjustment(s) are not properly formatted JSON.", 400)

        # Valid JSON is not necessarily a JSON object (e.g. ``null`` or a
        # list); reject those here instead of failing during iteration.
        if not isinstance(adjustments, dict):
            return make_response_json(
                "Adjustment(s) must be a JSON object mapping labels to "
                "numbers.", 400)

        for label, val in adjustments.items():
            if not isinstance(label, str):
                return make_response_json(
                    "Adjustment label '%s' is not a string type."
                    % label,
                    400)
            if not isinstance(val, (int, float)):
                return make_response_json(
                    "Adjustment value %s for label '%s' is not an int "
                    "or float" % (val, label),
                    400)

    if data_b64 is None:
        return make_response_json("No base-64 bytes provided.", 400)
    elif content_type is None:
        return make_response_json("No content type provided.", 400)

    # Malformed base64 is a client error: ``b64decode`` raises
    # ``binascii.Error``, a ``ValueError`` subclass, so report a 400
    # instead of letting the exception escape the handler.
    try:
        data_bytes = base64.b64decode(data_b64.encode('utf-8'))
    except ValueError:
        return make_response_json(
            "Provided bytes are not valid base-64.", 400)
    self._log.debug("Length of byte data: %d", len(data_bytes))

    data_elem = DataMemoryElement(data_bytes, content_type, readonly=True)
    descr_elem = self.descriptor_gen.generate_one_element(
        data_elem, descr_factory=self.descriptor_factory
    )
    self._log.debug("Descriptor shape: %s", descr_elem.vector().shape)

    try:
        clfr_map = self.classifier_collection.classify(
            descr_elem, labels=labels,
            factory=self.classification_factory)
    except MissingLabelError as ex:
        return make_response_json(
            "The following labels are not registered with any"
            " classifiers: '%s'"
            % "', '".join(ex.labels),
            404,
            missing_labels=list(ex.labels))

    # Transform classification result into JSON
    c_json = {}
    for classifier_label, c_elem in clfr_map.items():
        prediction = c_elem.get_classification()
        if adjustments:
            # Fix an ordering of the class labels so the probability and
            # adjustment vectors line up element for element.
            proba_labels = list(prediction.keys())
            proba = [prediction[k] for k in proba_labels]
            # Use opposite of adjustments, because we already set the
            # convention of "higher: precision, lower: recall"
            adj = [-adjustments.get(label, 0.0) for label in proba_labels]
            adj_proba = probability.adjust_proba(proba, adj)
            # Only the first row of the adjusted result is used.
            prediction = dict(zip(proba_labels, adj_proba[0]))
        c_json[classifier_label] = prediction

    return make_response_json('Finished classification.',
                              result=c_json)