def main():
    """Script entry point: compute one descriptor for a single input file."""
    arg_parser = cli_parser()
    parsed = arg_parser.parse_args()
    config = cli.utility_main_helper(default_config, parsed)
    log = logging.getLogger(__name__)

    out_path = parsed.output_filepath
    do_overwrite = parsed.overwrite

    # Validate the input file argument before doing any heavy setup.
    if not parsed.input_file:
        log.error("Failed to provide an input file path")
        exit(1)
    elif not os.path.isfile(parsed.input_file):
        log.error("Given path does not point to a file.")
        exit(1)

    elem = DataFileElement(parsed.input_file)

    factory = DescriptorElementFactory.from_config(config['descriptor_factory'])
    #: :type: smqtk.algorithms.descriptor_generator.DescriptorGenerator
    generator = from_config_dict(config['content_descriptor'],
                                 DescriptorGenerator.get_impls())
    vec = generate_vector(log, generator, elem, factory, do_overwrite)

    if out_path:
        numpy.save(out_path, vec)
    else:
        # Construct string, because numpy
        # noinspection PyTypeChecker
        print(' '.join('%15f' % f for f in vec))
def from_config(cls, config, parent_app):
    """
    Instantiate a new instance of this class given the configuration
    JSON-compliant dictionary encapsulating initialization arguments.

    :param config: JSON compliant dictionary encapsulating a configuration.
    :type config: dict

    :param parent_app: Parent containing flask app instance
    :type parent_app: smqtk.web.search_app.app.search_app

    :return: Constructed instance from the provided config.
    :rtype: IqrSearch
    """
    # Layer the provided configuration over this class's defaults.
    cfg = cls.get_default_config()
    cfg.update(config)

    # Replace nested plugin configuration sub-dicts with constructed
    # instances, in the same order as the original implementation.
    plugin_fields = (
        ('data_set', get_data_set_impls),
        ('descr_generator', get_descriptor_generator_impls),
        ('nn_index', get_nn_index_impls),
    )
    for key, get_impls in plugin_fields:
        cfg[key] = plugin.from_plugin_config(cfg[key], get_impls())

    cfg['descriptor_factory'] = \
        DescriptorElementFactory.from_config(cfg['descriptor_factory'])

    return cls(parent_app, **cfg)
def from_config(cls, config_dict, type_str, uuid, merge_default=True):
    """
    Instantiate a new instance from a JSON-compliant configuration dict.

    :param config_dict: Configuration dictionary. A shallow copy is taken so
        the caller's dictionary is not mutated.
    :type config_dict: dict
    :param type_str: Type string for the descriptor element.
    :type type_str: str
    :param uuid: UUID for the descriptor element.
    :param merge_default: Merge the class default configuration (passed
        through to the parent implementation).
    :type merge_default: bool
    """
    # BUGFIX: work on a shallow copy so we don't clobber the caller's
    # configuration dict with the instantiated factory object.
    config_dict = dict(config_dict)
    # convert factory configuration
    config_dict["wrapped_element_factory"] = \
        DescriptorElementFactory.from_config(
            config_dict["wrapped_element_factory"]
        )
    return super(CachingDescriptorElement, cls).from_config(
        config_dict, type_str, uuid, merge_default
    )
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath
        to SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before
        queueing that file for processing. If the check fails, the file is
        skipped instead of a halting exception being raised.
    :type check_image: bool
    """
    log = logging.getLogger(__name__)

    file_paths = [l.strip() for l in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def test_image_load(dfe):
        # Attempt a PIL image parse of the element's bytes; used to screen
        # out unreadable image files before descriptor computation.
        try:
            PIL.Image.open(io.BytesIO(dfe.get_bytes()))
            return True
        # BUGFIX: ``except IOError, ex`` is Python-2-only syntax; ``as``
        # works on Python 2.6+ and Python 3.
        except IOError as ex:
            # noinspection PyProtectedMember
            log.warning(
                "Failed to convert '%s' bytes into an image "
                "(error: %s). Skipping",
                dfe._filepath, str(ex))
            return False
def from_config(cls, config_dict, type_str, uuid, merge_default=True):
    """
    Instantiate a new instance from a JSON-compliant configuration dict.

    :param config_dict: Configuration dictionary. A shallow copy is taken so
        the caller's dictionary is not mutated.
    :type config_dict: dict
    :param type_str: Type string for the descriptor element.
    :type type_str: str
    :param uuid: UUID for the descriptor element.
    :param merge_default: Merge the class default configuration (passed
        through to the parent implementation).
    :type merge_default: bool
    """
    # BUGFIX: shallow-copy first -- the original replaced the
    # 'wrapped_element_factory' entry of the *caller's* dict with a live
    # factory instance as a side effect.
    config_dict = dict(config_dict)
    # convert factory configuration
    config_dict['wrapped_element_factory'] = \
        DescriptorElementFactory.from_config(
            config_dict['wrapped_element_factory']
        )
    return super(CachingDescriptorElement, cls).from_config(config_dict,
                                                            type_str, uuid,
                                                            merge_default)
def run_file_list(c, filelist_filepath, checkpoint_filepath):
    """
    Compute descriptors for every listed file the generator can handle,
    checkpointing filepath/descriptor-UUID pairs and pickling maps of the
    valid and invalid input paths.
    """
    log = logging.getLogger(__name__)

    file_paths = [l.strip() for l in open(filelist_filepath)]

    log.info("Making memory factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = from_plugin_config(c['descriptor_generator'],
                                   get_descriptor_generator_impls)
    log.info("Making descriptor generator -- Done")

    # Per-filepath content-type records, split by generator support.
    valid_file_paths = dict()
    invalid_file_paths = dict()

    def iter_valid_files():
        # Yield only filepaths whose content type the generator supports,
        # recording the verdict for every input path along the way.
        for path in file_paths:
            element = DataFileElement(path)
            content_type = element.content_type()
            if content_type not in generator.valid_content_types():
                invalid_file_paths[path] = content_type
                continue
            valid_file_paths[path] = content_type
            yield path

    log.info("Computing descriptors")
    descr_iter = compute_many_descriptors(iter_valid_files(),
                                          generator,
                                          factory,
                                          batch_size=256,
                                          )

    # Recording computed file paths and associated file UUIDs (SHA1)
    checkpoint = open(checkpoint_filepath, 'a')
    try:
        for path, descriptor in descr_iter:
            checkpoint.write("{:s},{:s}\n".format(
                path, descriptor.uuid()
            ))
            checkpoint.flush()
    finally:
        checkpoint.close()

    # Output valid file and invalid file dictionaries as pickle
    log.info("Writing valid filepaths map")
    with open('valid_file_map.pickle', 'wb') as f:
        cPickle.dump(valid_file_paths, f)
    log.info("Writing invalid filepaths map")
    with open('invalid_file_map.pickle', 'wb') as f:
        cPickle.dump(invalid_file_paths, f)

    log.info("Done")
def from_config(cls, config_dict, type_str, uuid):
    """
    Construct an instance from a configuration dictionary, filling in class
    defaults for any keys not provided.
    """
    # Overlay the user configuration on top of the class defaults.
    cfg = cls.get_default_config()
    cfg.update(config_dict)

    # The wrapped-element factory is itself configured via a nested
    # configuration; instantiate it before delegating upward.
    cfg['wrapped_element_factory'] = DescriptorElementFactory.from_config(
        cfg['wrapped_element_factory']
    )

    return super(CachingDescriptorElement, cls).from_config(cfg, type_str,
                                                            uuid)
def __init__(self, json_config):
    """
    Initialize the classifier web service from a JSON configuration
    dictionary, constructing all configured SMQTK plugin instances and
    registering the service's routes.

    :param json_config: JSON configuration dictionary.
    :type json_config: dict
    """
    super(SmqtkClassifierService, self).__init__(json_config)

    # Whether the classifier-removal endpoint is enabled for this instance.
    self.enable_classifier_removal = \
        bool(json_config[self.CONFIG_ENABLE_CLASSIFIER_REMOVAL])

    # Labels of classifiers that must not be removed via the web API.
    self.immutable_labels = set(json_config[self.CONFIG_IMMUTABLE_LABELS])

    # Convert configuration into SMQTK plugin instances.
    #   - Static classifier configurations.
    #       - Skip the example config key
    #   - Classification element factory
    #   - Descriptor generator
    #   - Descriptor element factory
    #   - from-IQR-state classifier configuration
    #       - There must at least be the default key defined for when no
    #         specific classifier type is specified at state POST.

    # Classifier collection + factor
    self.classification_factory = \
        ClassificationElementFactory.from_config(
            json_config[self.CONFIG_CLASSIFICATION_FACTORY]
        )
    #: :type: ClassifierCollection
    self.classifier_collection = ClassifierCollection.from_config(
        json_config[self.CONFIG_CLASSIFIER_COLLECTION]
    )

    # Descriptor generator + factory
    self.descriptor_factory = DescriptorElementFactory.from_config(
        json_config[self.CONFIG_DESCRIPTOR_FACTORY]
    )
    #: :type: smqtk.algorithms.DescriptorGenerator
    self.descriptor_gen = from_config_dict(
        json_config[self.CONFIG_DESCRIPTOR_GENERATOR],
        smqtk.algorithms.DescriptorGenerator.get_impls()
    )

    # Descriptor set bundled for classification-by-UID.
    # Falls back to an empty in-memory set when the optional configuration
    # is absent or invalid (from_config_dict raises ValueError on an
    # unusable config).
    try:
        self.descriptor_set = from_config_dict(
            json_config.get(self.CONFIG_DESCRIPTOR_SET, {}),
            DescriptorSet.get_impls()
        )
    except ValueError:
        # Default empty set.
        self.descriptor_set = MemoryDescriptorSet()

    # Classifier config for uploaded IQR states.
    self.iqr_state_classifier_config = \
        json_config[self.CONFIG_IQR_CLASSIFIER]

    self.add_routes()
def from_config(cls, config_dict, type_str, uuid):
    """
    Build an instance from ``config_dict`` merged over the class defaults.
    """
    merged = cls.get_default_config()
    merged.update(config_dict)

    # Instantiate the nested wrapped-element factory from its sub-config
    # before handing the merged configuration to the parent constructor.
    factory_cfg = merged['wrapped_element_factory']
    merged['wrapped_element_factory'] = \
        DescriptorElementFactory.from_config(factory_cfg)

    parent = super(CachingDescriptorElement, cls)
    return parent.from_config(merged, type_str, uuid)
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath
        to SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before
        queueing that file for processing. If the check fails, the file is
        skipped instead of a halting exception being raised.
    :type check_image: bool
    """
    log = logging.getLogger(__name__)

    file_paths = [l.strip() for l in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def test_image_load(dfe):
        # Screen out unreadable image files by attempting a PIL parse of
        # the element's bytes before descriptor computation.
        try:
            PIL.Image.open(io.BytesIO(dfe.get_bytes()))
            return True
        # BUGFIX: ``except IOError, ex`` is Python-2-only syntax; ``as``
        # works on Python 2.6+ and Python 3.
        except IOError as ex:
            # noinspection PyProtectedMember
            log.warning("Failed to convert '%s' bytes into an image "
                        "(error: %s). Skipping", dfe._filepath, str(ex))
            return False
def test_configuration(self):
    """Round-trip the default factory config and exercise the result."""
    cfg = DescriptorElementFactory.get_default_config()
    ntools.assert_is_none(cfg['type'])
    ntools.assert_in('DescriptorMemoryElement', cfg)

    # Select the in-memory implementation and build a factory from it.
    cfg['type'] = 'DescriptorMemoryElement'
    built = DescriptorElementFactory.from_config(cfg)
    ntools.assert_equal(built._d_type.__name__,
                        DescriptorMemoryElement.__name__)
    ntools.assert_equal(built._d_type_config, {})

    # A constructed element carries the requested type/uuid.
    elem = built.new_descriptor('test', 'foo')
    ntools.assert_equal(elem.type(), 'test')
    ntools.assert_equal(elem.uuid(), 'foo')
def test_configuration(self):
    """Round-trip the default factory config and exercise the result."""
    cfg = DescriptorElementFactory.get_default_config()
    self.assertIsNone(cfg['type'])
    self.assertIn('DescriptorMemoryElement', cfg)

    # Select the in-memory implementation and build a factory from it.
    cfg['type'] = 'DescriptorMemoryElement'
    built = DescriptorElementFactory.from_config(cfg)
    self.assertEqual(built._d_type.__name__,
                     DescriptorMemoryElement.__name__)
    self.assertEqual(built._d_type_config, {})

    # A constructed element carries the requested type/uuid.
    elem = built.new_descriptor('test', 'foo')
    self.assertEqual(elem.type(), 'test')
    self.assertEqual(elem.uuid(), 'foo')
def test_configuration(self):
    """Round-trip the default factory config and exercise the result."""
    cfg = DescriptorElementFactory.get_default_config()
    ntools.assert_is_none(cfg['type'])
    ntools.assert_in('DescriptorMemoryElement', cfg)

    # Select the in-memory implementation and build a factory from it.
    cfg['type'] = 'DescriptorMemoryElement'
    built = DescriptorElementFactory.from_config(cfg)
    ntools.assert_equal(built._d_type.__name__,
                        DescriptorMemoryElement.__name__)
    ntools.assert_equal(built._d_type_config, {})

    # A constructed element carries the requested type/uuid.
    elem = built.new_descriptor('test', 'foo')
    ntools.assert_equal(elem.type(), 'test')
    ntools.assert_equal(elem.uuid(), 'foo')

    # Reset class-level cache so state does not leak into other tests.
    DescriptorMemoryElement.MEMORY_CACHE = {}
def test_configuration(self):
    """Round-trip the default factory config and exercise the result."""
    cfg = DescriptorElementFactory.get_default_config()
    self.assertIsNone(cfg['type'])

    # Implementations are keyed by fully-qualified module path here.
    mem_elem_key = ('smqtk.representation.descriptor_element.local_elements'
                    '.DescriptorMemoryElement')
    self.assertIn(mem_elem_key, cfg)

    cfg['type'] = mem_elem_key
    built = DescriptorElementFactory.from_config(cfg)
    self.assertEqual(built._d_type.__name__,
                     DescriptorMemoryElement.__name__)
    self.assertEqual(built._d_type_config, {})

    # A constructed element carries the requested type/uuid.
    elem = built.new_descriptor('test', 'foo')
    self.assertEqual(elem.type(), 'test')
    self.assertEqual(elem.uuid(), 'foo')
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size):
    """
    Compute descriptors for the files listed in ``filelist_filepath``,
    checkpointing filepath/descriptor-UUID pairs and pickling maps of the
    valid and invalid input paths.
    """
    log = logging.getLogger(__name__)

    file_paths = [l.strip() for l in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c["descriptor_factory"])

    log.info("Making descriptor generator '%s'",
             c["descriptor_generator"]["type"])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = from_plugin_config(c["descriptor_generator"],
                                   get_descriptor_generator_impls)
    log.info("Making descriptor generator -- Done")

    # Per-filepath content-type records, split by generator support.
    valid_file_paths = dict()
    invalid_file_paths = dict()

    def iter_valid_elements():
        # Yield DataFileElements whose content type the generator supports,
        # recording the verdict for every input path along the way.
        for path in file_paths:
            element = DataFileElement(path)
            content_type = element.content_type()
            if content_type not in generator.valid_content_types():
                invalid_file_paths[path] = content_type
                continue
            valid_file_paths[path] = content_type
            yield element

    log.info("Computing descriptors")
    descr_iter = compute_many_descriptors(iter_valid_elements(), generator,
                                          factory, batch_size=batch_size)

    # Recording computed file paths and associated file UUIDs (SHA1)
    checkpoint = open(checkpoint_filepath, "a")
    try:
        for path, descriptor in descr_iter:
            checkpoint.write("{:s},{:s}\n".format(path, descriptor.uuid()))
            checkpoint.flush()
    finally:
        checkpoint.close()

    # Output valid file and invalid file dictionaries as pickle
    log.info("Writing valid filepaths map")
    with open("file_map.valid.pickle", "wb") as f:
        cPickle.dump(valid_file_paths, f)
    log.info("Writing invalid filepaths map")
    with open("file_map.invalid.pickle", "wb") as f:
        cPickle.dump(invalid_file_paths, f)

    log.info("Done")
def __init__(self, json_config):
    """
    Initialize the classifier web service from a JSON configuration
    dictionary, constructing all configured SMQTK plugin instances and
    registering the service's routes.

    :param json_config: JSON configuration dictionary.
    :type json_config: dict
    """
    super(SmqtkClassifierService, self).__init__(json_config)

    # Whether the classifier-removal endpoint is enabled for this instance.
    self.enable_classifier_removal = \
        bool(json_config[self.CONFIG_ENABLE_CLASSIFIER_REMOVAL])

    # Labels of classifiers that must not be removed via the web API.
    self.immutable_labels = set(json_config[self.CONFIG_IMMUTABLE_LABELS])

    # Convert configuration into SMQTK plugin instances.
    #   - Static classifier configurations.
    #       - Skip the example config key
    #   - Classification element factory
    #   - Descriptor generator
    #   - Descriptor element factory
    #   - from-IQR-state classifier configuration
    #       - There must at least be the default key defined for when no
    #         specific classifier type is specified at state POST.

    # Classifier collection + factor
    self.classification_factory = \
        ClassificationElementFactory.from_config(
            json_config[self.CONFIG_CLASSIFICATION_FACTORY]
        )
    self.classifier_collection = ClassifierCollection.from_config(
        json_config[self.CONFIG_CLASSIFIER_COLLECTION]
    )

    # Descriptor generator + factory
    self.descriptor_factory = DescriptorElementFactory.from_config(
        json_config[self.CONFIG_DESCRIPTOR_FACTORY]
    )
    #: :type: smqtk.algorithms.DescriptorGenerator
    self.descriptor_gen = smqtk.utils.plugin.from_plugin_config(
        json_config[self.CONFIG_DESCRIPTOR_GENERATOR],
        smqtk.algorithms.get_descriptor_generator_impls()
    )

    # Classifier config for uploaded IQR states.
    self.iqr_state_classifier_config = \
        json_config[self.CONFIG_IQR_CLASSIFIER]

    self.add_routes()
def __init__(self, json_config):
    """
    Initialize application based of supplied JSON configuration

    :param json_config: JSON configuration dictionary
    :type json_config: dict
    """
    super(DescriptorServiceServer, self).__init__(json_config)

    # Descriptor factory setup
    self._log.info("Initializing DescriptorElementFactory")
    self.descr_elem_factory = DescriptorElementFactory.from_config(
        self.json_config['descriptor_factory'])

    # Descriptor generator configuration labels
    #: :type: dict[str, dict]
    self.generator_label_configs = self.json_config[
        'descriptor_generators']

    # Cache of DescriptorGenerator instances so we don't have to
    # continuously initialize them as we get requests.
    self.descriptor_cache = {}
    self.descriptor_cache_lock = multiprocessing.RLock()

    @self.route("/")
    def list_ingest_labels():
        # List the configured descriptor-generator labels.
        return flask.jsonify(
            {"labels": sorted(self.generator_label_configs)})

    @self.route("/all/content_types")
    def all_content_types():
        """
        Of available descriptors, what content types are processable, and
        what types are associated to which available descriptor
        generator.
        """
        all_types = set()
        # Mapping of configuration label to content types that generator
        # can handle
        r = {}
        for l in self.generator_label_configs:
            d = self.get_descriptor_inst(l)
            all_types.update(d.valid_content_types())
            r[l] = sorted(d.valid_content_types())

        return flask.jsonify({"all": sorted(all_types), "by-label": r})

    @self.route("/all/compute/<path:uri>")
    def all_compute(uri):
        """
        Compute descriptors over the specified content for all generators
        that function over the data's content type.

        JSON Return format::

            {
                "success": <bool>
                "content_type": <str>
                "message": <str>
                "descriptors": {
                    "<label>": <list[float]>,
                    ...
                } | None
                "reference_uri": <str>
            }
        """
        message = "execution nominal"

        data_elem = None
        try:
            data_elem = self.resolve_data_element(uri)
        except ValueError as ex:
            message = "Failed URI resolution: %s" % str(ex)

        descriptors = {}
        finished_loop = False
        if data_elem:
            # Run every generator whose supported content types include
            # the element's type; a failed generator leaves a None entry.
            for l in self.generator_label_configs:
                if data_elem.content_type() in \
                        self.get_descriptor_inst(l).valid_content_types():
                    d = None
                    try:
                        d = self.generate_descriptor(data_elem, l)
                    except RuntimeError as ex:
                        message = "Descriptor extraction failure: %s" \
                                  % str(ex)
                    except ValueError as ex:
                        message = "Data content type issue: %s" % str(ex)
                    descriptors[l] = d and d.vector().tolist()
            if not descriptors:
                # NOTE(review): ``data_elem.content_type`` is missing its
                # call parentheses here, so the message interpolates a bound
                # method instead of the type string -- confirm intended.
                message = "No descriptors can handle URI content type: %s" \
                          % data_elem.content_type
            else:
                finished_loop = True

        # NOTE(review): if URI resolution failed above, ``data_elem`` is
        # None and this ``data_elem.content_type()`` call will raise an
        # AttributeError -- confirm intended behavior.
        return flask.jsonify({
            "success": finished_loop,
            "content_type": data_elem.content_type(),
            "message": message,
            "descriptors": descriptors,
            "reference_uri": uri
        })

    @self.route("/<string:descriptor_label>/<path:uri>")
    def compute_descriptor(descriptor_label, uri):
        """
        Data modes for upload/use::

            - local filepath
            - base64
            - http/s URL

        The following sub-sections detail how different URI's can be used.

        Local Filepath
        --------------
        The URI string must be prefixed with ``file://``, followed by the
        full path to the data file to describe.

        Base 64 data
        ------------
        The URI string must be prefixed with "base64://", followed by the
        base64 encoded string. This mode also requires an additional
        ``?content_type=`` to provide data content type information. This
        mode saves the encoded data to temporary file for processing.

        HTTP/S address
        --------------
        This is the default mode when the URI prefix is none of the above.
        This uses the requests module to locally download a data file for
        processing.

        JSON Return format::

            {
                "success": <bool>
                "message": <str>
                "descriptor": <None|list[float]>
                "reference_uri": <str>
            }

        :type descriptor_label: str
        :type uri: str
        """
        message = "execution nominal"
        descriptor = None

        de = None
        try:
            de = self.resolve_data_element(uri)
        except ValueError as ex:
            message = "URI resolution issue: %s" % str(ex)

        if de:
            try:
                descriptor = self.generate_descriptor(de, descriptor_label)
            except RuntimeError as ex:
                message = "Descriptor extraction failure: %s" % str(ex)
            except ValueError as ex:
                message = "Data content type issue: %s" % str(ex)

        return flask.jsonify({
            "success": descriptor is not None,
            "message": message,
            "descriptor": (descriptor is not None and
                           descriptor.vector().tolist()) or None,
            "reference_uri": uri
        })
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath
        to SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before
        queueing that file for processing. If the check fails, the file is
        skipped instead of a halting exception being raised.
    :type check_image: bool
    """
    log = logging.getLogger(__name__)

    file_paths = [line.strip() for line in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    descriptor_set = cast(
        DescriptorSet,
        from_config_dict(c['descriptor_set'], DescriptorSet.get_impls())
    )

    # ``data_set`` added to within the ``iter_valid_elements`` function.
    data_set: Optional[DataSet] = None
    if c['optional_data_set']['type'] is None:
        log.info("Not saving loaded data elements to data set")
    else:
        log.info("Initializing data set to append to")
        data_set = cast(
            DataSet,
            from_config_dict(c['optional_data_set'], DataSet.get_impls())
        )

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    generator = cast(
        DescriptorGenerator,
        from_config_dict(c['descriptor_generator'],
                         DescriptorGenerator.get_impls())
    )

    def iter_valid_elements():
        # Filter the input list in parallel down to elements the generator
        # can process, optionally verifying image readability, and batch
        # valid elements into the optional data set.
        def is_valid(file_path):
            e = DataFileElement(file_path)
            if is_valid_element(
                    e, valid_content_types=generator.valid_content_types(),
                    check_image=check_image):
                return e
            else:
                return False

        data_elements: Deque[DataFileElement] = collections.deque()
        valid_files_filter = parallel.parallel_map(is_valid, file_paths,
                                                   name="check-file-type",
                                                   use_multiprocessing=True)
        for dfe in valid_files_filter:
            if dfe:
                yield dfe
                if data_set is not None:
                    data_elements.append(dfe)
                    if batch_size and len(data_elements) == batch_size:
                        log.debug("Adding data element batch to set "
                                  "(size: %d)", len(data_elements))
                        data_set.add_data(*data_elements)
                        data_elements.clear()
        # elements only collected if we have a data-set configured, so add
        # any still in the deque to the set
        if data_set is not None and data_elements:
            # BUGFIX: this log format string was missing its closing ")".
            log.debug("Adding data elements to set (size: %d)",
                      len(data_elements))
            data_set.add_data(*data_elements)

    log.info("Computing descriptors")
    m = compute_many_descriptors(
        iter_valid_elements(),
        generator,
        factory,
        descriptor_set,
        batch_size=batch_size,
    )

    # Recording computed file paths and associated file UUIDs (SHA1)
    cf = open(checkpoint_filepath, 'w')
    cf_writer = csv.writer(cf)
    try:
        pr = ProgressReporter(log.debug, 1.0).start()
        for de, descr in m:
            # We know that we are using DataFileElements going into the
            # compute_many_descriptors, so we can assume that's what comes
            # out of it as well.
            # noinspection PyProtectedMember
            cf_writer.writerow([de._filepath, descr.uuid()])
            pr.increment_report()
        pr.report()
    finally:
        del cf_writer
        cf.close()

    log.info("Done")
def __init__(self, json_config):
    """
    Initialize application based of supplied JSON configuration

    :param json_config: JSON configuration dictionary
    :type json_config: dict
    """
    super(DescriptorServiceServer, self).__init__(json_config)

    # Descriptor factory setup
    self.log.info("Initializing DescriptorElementFactory")
    self.descr_elem_factory = DescriptorElementFactory.from_config(
        self.json_config['descriptor_factory']
    )

    # Descriptor generator configuration labels
    #: :type: dict[str, dict]
    self.generator_label_configs = self.json_config['descriptor_generators']

    # Cache of DescriptorGenerator instances so we don't have to continuously
    # initialize them as we get requests.
    self.descriptor_cache = {}
    self.descriptor_cache_lock = multiprocessing.RLock()

    @self.route("/")
    def list_ingest_labels():
        # NOTE(review): ``iterkeys`` is Python-2-only; under Python 3 this
        # would need ``sorted(self.generator_label_configs)``.
        return flask.jsonify({
            "labels": sorted(self.generator_label_configs.iterkeys())
        })

    @self.route("/all/content_types")
    def all_content_types():
        """
        Of available descriptors, what content types are processable, and
        what types are associated to which available descriptor generator.
        """
        all_types = set()
        # Mapping of configuration label to content types that generator
        # can handle
        r = {}
        for l in self.generator_label_configs:
            d = self.get_descriptor_inst(l)
            all_types.update(d.valid_content_types())
            r[l] = sorted(d.valid_content_types())

        return flask.jsonify({
            "all": sorted(all_types),
            "by-label": r
        })

    @self.route("/all/compute/<path:uri>")
    def all_compute(uri):
        """
        Compute descriptors over the specified content for all generators
        that function over the data's content type.

        JSON Return format::

            {
                "success": <bool>
                "content_type": <str>
                "message": <str>
                "descriptors": {
                    "<label>": <list[float]>,
                    ...
                } | None
                "reference_uri": <str>
            }
        """
        message = "execution nominal"

        data_elem = None
        try:
            data_elem = self.resolve_data_element(uri)
        # BUGFIX: ``except ValueError, ex`` is Python-2-only syntax; the
        # ``as`` form works on Python 2.6+ and Python 3.
        except ValueError as ex:
            message = "Failed URI resolution: %s" % str(ex)

        descriptors = {}
        finished_loop = False
        if data_elem:
            for l in self.generator_label_configs:
                if data_elem.content_type() \
                        in self.get_descriptor_inst(l).valid_content_types():
                    d = None
                    try:
                        d = self.generate_descriptor(data_elem, l)
                    except RuntimeError as ex:
                        message = "Descriptor extraction failure: %s" \
                                  % str(ex)
                    except ValueError as ex:
                        message = "Data content type issue: %s" % str(ex)
                    descriptors[l] = d and d.vector().tolist()
def main():
    """
    Script entry point: compute a descriptor for a single input file and
    either save it with numpy or print it to stdout.
    """
    parser = cli_parser()
    args = parser.parse_args()

    output_filepath = args.output_filepath
    overwrite = args.overwrite
    verbose = args.verbose

    llevel = logging.DEBUG if verbose else logging.INFO
    bin_utils.initialize_logging(logging.getLogger(), llevel)
    log = logging.getLogger("main")

    # Merge loaded config with default
    config_loaded = False
    config = default_config()
    if args.config:
        if os.path.isfile(args.config):
            with open(args.config, 'r') as f:
                config.update(json.load(f))
            config_loaded = True
        else:
            log.error("Configuration file path not valid.")
            exit(1)

    bin_utils.output_config(args.output_config, config, log, True)

    # Configuration must have been loaded at this point since we can't
    # normally trust the default.
    if not config_loaded:
        log.error("No configuration provided")
        exit(1)

    if not args.input_file:
        log.error("Failed to provide an input file path")
        exit(1)
    elif not os.path.isfile(args.input_file):
        log.error("Given path does not point to a file.")
        exit(1)

    input_filepath = args.input_file
    data_element = DataFileElement(input_filepath)

    factory = DescriptorElementFactory.from_config(config['descriptor_factory'])
    #: :type: smqtk.algorithms.descriptor_generator.DescriptorGenerator
    cd = plugin.from_plugin_config(config['content_descriptor'],
                                   get_descriptor_generator_impls())
    descr_elem = cd.compute_descriptor(data_element, factory, overwrite)

    vec = descr_elem.vector()
    if vec is None:
        log.error("Failed to generate a descriptor vector for the input data!")
        # BUGFIX: previously execution continued with ``vec = None``, which
        # would crash in numpy.save / the print loop below. Exit instead.
        exit(1)

    if output_filepath:
        numpy.save(output_filepath, vec)
    else:
        # Construct string, because numpy
        s = []
        # noinspection PyTypeChecker
        for f in vec:
            s.append('%15f' % f)
        # Parenthesized form works as both a Py2 statement and Py3 call.
        print(' '.join(s))
def __init__(self, json_config):
    """
    Initialize application based of supplied JSON configuration

    :param json_config: JSON configuration dictionary
    :type json_config: dict
    """
    super(NearestNeighborServiceServer, self).__init__(json_config)

    # Descriptor factory setup
    self.log.info("Initializing DescriptorElementFactory")
    self.descr_elem_factory = DescriptorElementFactory.from_config(
        self.json_config['descriptor_factory'])

    # Descriptor generator configuration labels
    #: :type: dict[str, dict]
    self.generator_config = self.json_config['descriptor_generator']

    #: :type: smqtk.algorithms.NearestNeighborsIndex
    self.nn_index = plugin.from_plugin_config(json_config['nn_index'],
                                              get_nn_index_impls)

    #: :type: smqtk.algorithms.DescriptorGenerator
    self.descriptor_generator_inst = plugin.from_plugin_config(
        self.generator_config, get_descriptor_generator_impls)

    @self.route("/nn/<path:uri>")
    @self.route("/nn/n=<int:n>/<path:uri>")
    @self.route("/nn/n=<int:n>/<int:start_i>:<int:end_i>/<path:uri>")
    def compute_nearest_neighbors(uri, n=10, start_i=None, end_i=None):
        """
        Data modes for upload/use::

            - local filepath
            - base64
            - http/s URL

        The following sub-sections detail how different URI's can be used.

        Local Filepath
        --------------
        The URI string must be prefixed with ``file://``, followed by the
        full path to the data file to describe.

        Base 64 data
        ------------
        The URI string must be prefixed with "base64://", followed by the
        base64 encoded string. This mode also requires an additional
        ``?content_type=`` to provide data content type information. This
        mode saves the encoded data to temporary file for processing.

        HTTP/S address
        --------------
        This is the default mode when the URI prefix is none of the above.
        This uses the requests module to locally download a data file for
        processing.

        JSON Return format::

            {
                "success": <bool>
                "message": <str>
                "neighbors": <None|list[float]>
                "reference_uri": <str>
            }

        :type uri: str
        """
        message = "execution nominal"
        descriptor = None

        de = None
        try:
            self.log.debug("Received URI: %s", uri)
            de = self.resolve_data_element(uri)
        # BUGFIX: ``except ValueError, ex`` is Python-2-only syntax; the
        # ``as`` form works on Python 2.6+ and Python 3.
        except ValueError as ex:
            message = "URI resolution issue: %s" % str(ex)

        if de:
            try:
                descriptor = self.descriptor_generator_inst.\
                    compute_descriptor(de, self.descr_elem_factory)
            except RuntimeError as ex:
                message = "Descriptor extraction failure: %s" % str(ex)
            except ValueError as ex:
                message = "Data content type issue: %s" % str(ex)
def classify_files(config, label, file_globs):
    """
    Classify files matched by the given globs and print the paths of those
    whose maximum-confidence classification equals ``label``.

    :param config: Application configuration dictionary.
    :type config: dict
    :param label: Classification label to filter on. If None, the
        classifier's available labels are logged and nothing is classified.
    :param file_globs: Iterable of file paths and/or glob expressions.

    :raises RuntimeError: No files were matched by the given globs.
    """
    log = logging.getLogger(__name__)

    #: :type: smqtk.algorithms.Classifier
    classifier = \
        plugin.from_plugin_config(config['classifier'], get_classifier_impls())

    # Fixed typo: helper was previously named ``log_avaialable_labels``.
    def log_available_labels():
        log.info("Available classifier labels:")
        for l in classifier.get_labels():
            log.info("- %s", l)

    if label is None:
        log_available_labels()
        return
    elif label not in classifier.get_labels():
        log.error("Invalid classification label provided to compute and "
                  "filter on: '%s'", label)
        log_available_labels()
        return

    log.info("Collecting files from globs")
    #: :type: list[DataFileElement]
    data_elements = []
    uuid2filepath = {}
    for g in file_globs:
        if os.path.isfile(g):
            d = DataFileElement(g)
            data_elements.append(d)
            uuid2filepath[d.uuid()] = g
        else:
            log.debug("expanding glob: %s", g)
            for fp in glob.iglob(g):
                d = DataFileElement(fp)
                data_elements.append(d)
                uuid2filepath[d.uuid()] = fp
    if not data_elements:
        raise RuntimeError("No files provided for classification.")

    log.info("Computing descriptors")
    descriptor_factory = \
        DescriptorElementFactory.from_config(config['descriptor_factory'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(config['descriptor_generator'],
                                  get_descriptor_generator_impls())
    descr_map = descriptor_generator\
        .compute_descriptor_async(data_elements, descriptor_factory)

    log.info("Classifying descriptors")
    classification_factory = ClassificationElementFactory \
        .from_config(config['classification_factory'])
    classification_map = classifier\
        .classify_async(list(descr_map.values()), classification_factory)

    log.info("Printing input file paths that classified as the given label.")
    # map of UUID to filepath:
    uuid2c = dict((c.uuid, c) for c in six.itervalues(classification_map))
    for data in data_elements:
        d_uuid = data.uuid()
        log.debug("'{}' classification map: {}".format(
            uuid2filepath[d_uuid], uuid2c[d_uuid].get_classification()
        ))
        if uuid2c[d_uuid].max_label() == label:
            print(uuid2filepath[d_uuid])
def __init__(self, json_config):
    """
    Initialize the application based on the supplied JSON configuration.

    Sets up the descriptor element factory, the per-label descriptor
    generator configurations and an instance cache, then registers the
    flask routes served by this application.

    :param json_config: JSON configuration dictionary
    :type json_config: dict
    """
    super(DescriptorServiceServer, self).__init__(json_config)

    # Descriptor factory setup
    self._log.info("Initializing DescriptorElementFactory")
    self.descr_elem_factory = DescriptorElementFactory.from_config(
        self.json_config['descriptor_factory']
    )

    # Descriptor generator configuration labels
    #: :type: dict[str, dict]
    self.generator_label_configs = self.json_config['descriptor_generators']

    # Cache of DescriptorGenerator instances so we don't have to
    # continuously initialize them as we get requests.
    self.descriptor_cache = {}
    self.descriptor_cache_lock = multiprocessing.RLock()

    @self.route("/")
    def list_ingest_labels():
        # List the descriptor generator labels this service is configured
        # with.
        return flask.jsonify({
            "labels": sorted(self.generator_label_configs)
        })

    @self.route("/all/content_types")
    def all_content_types():
        """
        Of available descriptors, what content types are processable, and
        what types are associated to which available descriptor generator.
        """
        all_types = set()
        # Mapping of configuration label to content types that generator
        # can handle
        r = {}
        for l in self.generator_label_configs:
            d = self.get_descriptor_inst(l)
            all_types.update(d.valid_content_types())
            r[l] = sorted(d.valid_content_types())

        return flask.jsonify({
            "all": sorted(all_types),
            "by-label": r
        })

    @self.route("/all/compute/<path:uri>")
    def all_compute(uri):
        """
        Compute descriptors over the specified content for all generators
        that function over the data's content type.

        JSON Return format::

            {
                "success": <bool>
                "content_type": <str>
                "message": <str>
                "descriptors": {
                    "<label>": <list[float]>,
                    ...
                } | None
                "reference_uri": <str>
            }

        """
        message = "execution nominal"

        data_elem = None
        try:
            data_elem = self.resolve_data_element(uri)
        except ValueError as ex:
            message = "Failed URI resolution: %s" % str(ex)

        descriptors = {}
        finished_loop = False
        if data_elem:
            for l in self.generator_label_configs:
                if data_elem.content_type() in \
                        self.get_descriptor_inst(l).valid_content_types():
                    d = None
                    try:
                        d = self.generate_descriptor(data_elem, l)
                    except RuntimeError as ex:
                        message = "Descriptor extraction failure: %s" \
                                  % str(ex)
                    except ValueError as ex:
                        message = "Data content type issue: %s" % str(ex)
                    # None recorded for this label when extraction failed.
                    descriptors[l] = d and d.vector().tolist()
            if not descriptors:
                # (fix) call content_type() -- previously the bound method
                # object itself was interpolated into the message.
                message = "No descriptors can handle URI content type: %s" \
                          % data_elem.content_type()
            else:
                finished_loop = True

        return flask.jsonify({
            "success": finished_loop,
            # (fix) data_elem is None when URI resolution failed; calling
            # content_type() on it unconditionally raised AttributeError.
            "content_type": data_elem.content_type() if data_elem else None,
            "message": message,
            "descriptors": descriptors,
            "reference_uri": uri
        })

    @self.route("/<string:descriptor_label>/<path:uri>")
    def compute_descriptor(descriptor_label, uri):
        """
        Compute a descriptor for the given URI using the generator
        configured under the given label.

        Data modes for upload/use::

            - local filepath
            - base64
            - http/s URL

        The following sub-sections detail how different URI's can be used.

        Local Filepath
        --------------
        The URI string must be prefixed with ``file://``, followed by the
        full path to the data file to describe.

        Base 64 data
        ------------
        The URI string must be prefixed with "base64://", followed by the
        base64 encoded string. This mode also requires an additional
        ``?content_type=`` to provide data content type information. This
        mode saves the encoded data to temporary file for processing.

        HTTP/S address
        --------------
        This is the default mode when the URI prefix is none of the above.
        This uses the requests module to locally download a data file for
        processing.

        JSON Return format::

            {
                "success": <bool>
                "message": <str>
                "descriptor": <None|list[float]>
                "reference_uri": <str>
            }

        :type descriptor_label: str
        :type uri: str
        """
        message = "execution nominal"
        descriptor = None

        de = None
        try:
            de = self.resolve_data_element(uri)
        except ValueError as ex:
            message = "URI resolution issue: %s" % str(ex)

        if de:
            try:
                descriptor = self.generate_descriptor(de, descriptor_label)
            except RuntimeError as ex:
                message = "Descriptor extraction failure: %s" % str(ex)
            except ValueError as ex:
                message = "Data content type issue: %s" % str(ex)

        return flask.jsonify({
            "success": descriptor is not None,
            "message": message,
            # NOTE: an empty vector list would also collapse to None via
            # this and/or chain; behavior preserved as-is.
            "descriptor": (descriptor is not None and
                           descriptor.vector().tolist()) or None,
            "reference_uri": uri
        })
def __init__(self, json_config):
    """
    Initialize the application based on the supplied JSON configuration.

    Constructs the descriptor element factory, optional descriptor index,
    nearest-neighbors index and descriptor generator from their respective
    configuration sub-sections, then registers the ``/count`` and
    ``/compute/<uri>`` flask routes as closures over those instances.

    :param json_config: JSON configuration dictionary
    :type json_config: dict
    """
    super(NearestNeighborServiceServer, self).__init__(json_config)

    # Whether computed descriptors should also be added to the configured
    # descriptor index.
    self.update_index = json_config['update_descriptor_index']

    # Descriptor factory setup
    self._log.info("Initializing DescriptorElementFactory")
    self.descr_elem_factory = DescriptorElementFactory.from_config(
        self.json_config['descriptor_factory']
    )

    # Only constructed when index updating is enabled.
    #: :type: smqtk.representation.DescriptorIndex | None
    self.descr_index = None
    if self.update_index:
        self._log.info("Initializing DescriptorIndex to update")
        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = plugin.from_plugin_config(
            json_config['descriptor_index'],
            get_descriptor_index_impls()
        )

    #: :type: smqtk.algorithms.NearestNeighborsIndex
    self.nn_index = plugin.from_plugin_config(
        json_config['nn_index'],
        get_nn_index_impls()
    )

    #: :type: smqtk.algorithms.DescriptorGenerator
    self.descriptor_generator_inst = plugin.from_plugin_config(
        self.json_config['descriptor_generator'],
        get_descriptor_generator_impls()
    )

    @self.route("/count", methods=['GET'])
    def count():
        """
        Return the number of elements represented in this index.
        """
        return flask.jsonify(**{
            "count": self.nn_index.count(),
        })

    @self.route("/compute/<path:uri>", methods=["POST"])
    def compute(uri):
        """
        Compute the descriptor for a URI specified data element using the
        configured descriptor generator.

        If a descriptor index was configured and update was turned on, we
        add the computed descriptor to the index.

        JSON Return format::

            {
                "success": <bool>
                "message": <str>
                "descriptor": <None|list[float]>
                "reference_uri": <str>
            }

        :param uri: URI data specification.
        """
        descriptor = None
        try:
            # Pair return: (data element, descriptor element); only the
            # descriptor is used here.
            _, descriptor = self.generate_descriptor_for_uri(uri)
            message = "Descriptor generated"
            # NOTE(review): Python 2 ``map`` returns a list; under Python 3
            # this would be a lazy iterator and not JSON-serializable as-is.
            descriptor = map(float, descriptor.vector())
        # NOTE(review): ``except X, ex`` is Python-2-only syntax; this
        # block will not parse under Python 3.
        except ValueError, ex:
            message = "Input value issue: %s" % str(ex)
        except RuntimeError, ex:
            message = "Descriptor extraction failure: %s" % str(ex)
        # NOTE(review): no response is constructed or returned after the
        # try/except -- this handler appears truncated in this revision;
        # confirm against the complete upstream source.
def main():
    """
    CLI entry point: compute a descriptor vector for one input data file,
    writing it to an optional numpy output file or printing it to stdout.
    """
    usage = "%prog [OPTIONS] INPUT_FILE"
    description = """\
Compute a descriptor vector for a given data file, outputting the generated
feature vector to standard out, or to an output file if one was specified (in
numpy format).
"""
    parser = bin_utils.SMQTKOptParser(usage, description=description)

    group_labels = optparse.OptionGroup(parser, "Configuration")
    group_labels.add_option('-c', '--config',
                            default=None,
                            help='Path to the JSON configuration file.')
    group_labels.add_option('--output-config',
                            default=None,
                            help='Optional path to output default JSON '
                                 'configuration to.')
    parser.add_option_group(group_labels)

    group_optional = optparse.OptionGroup(parser, "Optional Parameters")
    group_optional.add_option('--overwrite',
                              action='store_true', default=False,
                              help="Force descriptor computation even if an "
                                   "existing descriptor vector was discovered "
                                   "based on the given content descriptor type "
                                   "and data combination.")
    group_optional.add_option('-o', '--output-filepath',
                              help='Optional path to a file to output feature '
                                   'vector to. Otherwise the feature vector is '
                                   'printed to standard out. Output is saved '
                                   'in numpy binary format (.npy suffix '
                                   'recommended).')
    group_optional.add_option('-v', '--verbose',
                              action='store_true', default=False,
                              help='Print additional debugging messages. All '
                                   'logging goes to standard error.')
    parser.add_option_group(group_optional)

    opts, args = parser.parse_args()

    output_filepath = opts.output_filepath
    overwrite = opts.overwrite
    verbose = opts.verbose

    llevel = logging.DEBUG if verbose else logging.INFO
    bin_utils.initialize_logging(logging.getLogger(), llevel)
    log = logging.getLogger("main")

    bin_utils.output_config(opts.output_config, default_config(), log)

    # Validate required inputs before doing any work.
    if not opts.config:
        log.error("No configuration provided")
        exit(1)
    elif not os.path.isfile(opts.config):
        log.error("Configuration file path not valid.")
        exit(1)

    if len(args) == 0:
        log.error("Failed to provide an input file path")
        exit(1)
    if len(args) > 1:
        log.warning("More than one filepath provided as an argument. Only "
                    "computing for the first one.")

    with open(opts.config, 'r') as f:
        config = json.load(f)

    input_filepath = args[0]
    data_element = DataFileElement(input_filepath)

    factory = DescriptorElementFactory.from_config(config['descriptor_factory'])
    #: :type: smqtk.descriptor_generator.DescriptorGenerator
    cd = plugin.from_plugin_config(config['content_descriptor'],
                                   get_descriptor_generator_impls)
    descr_elem = cd.compute_descriptor(data_element, factory, overwrite)
    vec = descr_elem.vector()

    if vec is None:
        log.error("Failed to generate a descriptor vector for the input data!")
        # (fix) abort like the other error paths above -- previously we fell
        # through and tried to save/iterate a None vector.
        exit(1)

    if output_filepath:
        numpy.save(output_filepath, vec)
    else:
        # Construct string, because numpy
        s = []
        # noinspection PyTypeChecker
        for f in vec:
            s.append('%15f' % f)
        # (fix) parenthesized form is valid in both Python 2 and 3.
        print(' '.join(s))
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath
        to SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before
        queueing that file for processing. If the check fails, the file is
        skipped instead of a halting exception being raised.
    :type check_image: bool
    """
    log = logging.getLogger(__name__)

    # (fix) close the file-list handle deterministically instead of relying
    # on garbage collection.
    with open(filelist_filepath) as f:
        file_paths = [l.strip() for l in f]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    data_set = None
    if c['optional_data_set']['type'] is None:
        log.info("Not saving loaded data elements to data set")
    else:
        log.info("Initializing data set to append to")
        #: :type: smqtk.representation.DataSet
        data_set = plugin.from_plugin_config(c['optional_data_set'],
                                             get_data_set_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def iter_valid_elements():
        """
        Yield DataFileElement instances for input files whose content type
        the generator can handle, batching valid elements into the
        configured data set (if any) along the way.
        """
        def is_valid(file_path):
            # Returns the element when usable, else False as a "skip"
            # sentinel so results can be filtered after the parallel map.
            dfe = DataFileElement(file_path)
            if is_valid_element(
                    dfe, valid_content_types=generator.valid_content_types(),
                    check_image=check_image):
                return dfe
            else:
                return False

        data_elements = collections.deque()
        valid_files_filter = parallel.parallel_map(is_valid, file_paths,
                                                   name="check-file-type",
                                                   use_multiprocessing=True)
        for dfe in valid_files_filter:
            if dfe:
                yield dfe
                if data_set is not None:
                    data_elements.append(dfe)
                    if batch_size and len(data_elements) == batch_size:
                        log.debug("Adding data element batch to set "
                                  "(size: %d)", len(data_elements))
                        data_set.add_data(*data_elements)
                        data_elements.clear()
        # elements only collected if we have a data-set configured, so add
        # any still in the deque to the set
        if data_elements:
            # (fix) message previously read "(size: %d" -- the closing
            # parenthesis was missing.
            log.debug("Adding data elements to set (size: %d)",
                      len(data_elements))
            data_set.add_data(*data_elements)

    log.info("Computing descriptors")
    m = compute_many_descriptors(iter_valid_elements(),
                                 generator,
                                 factory,
                                 descriptor_index,
                                 batch_size=batch_size,
                                 )

    # Recording computed file paths and associated file UUIDs (SHA1)
    # (fix) context manager replaces the manual open/try/finally/close so the
    # checkpoint file is closed even if descriptor computation raises.
    with open(checkpoint_filepath, 'w') as cf:
        cf_writer = csv.writer(cf)
        # State vector used by report_progress for interval bookkeeping.
        rps = [0] * 7
        for fp, descr in m:
            cf_writer.writerow([fp, descr.uuid()])
            report_progress(log.debug, rps, 1.)

    log.info("Done")
def __init__(self, json_config):
    """
    Initialize the application based on the supplied JSON configuration.

    Constructs the descriptor element factory, nearest-neighbors index and
    descriptor generator from configuration, then registers the ``/nn``
    flask routes as closures over those instances.

    :param json_config: JSON configuration dictionary
    :type json_config: dict
    """
    super(NearestNeighborServiceServer, self).__init__(json_config)

    # Descriptor factory setup
    self.log.info("Initializing DescriptorElementFactory")
    self.descr_elem_factory = DescriptorElementFactory.from_config(
        self.json_config['descriptor_factory']
    )

    # Descriptor generator configuration labels
    #: :type: dict[str, dict]
    self.generator_config = self.json_config['descriptor_generator']

    # NOTE(review): the impl-getter functions below are passed un-called
    # (no parentheses), unlike other revisions in this file that pass
    # ``get_*_impls()``. This assumes a plugin API version whose
    # ``from_plugin_config`` accepts the getter callable itself -- confirm
    # against the plugin utility version in use.
    #: :type: smqtk.algorithms.NearestNeighborsIndex
    self.nn_index = plugin.from_plugin_config(
        json_config['nn_index'],
        get_nn_index_impls
    )

    #: :type: smqtk.algorithms.DescriptorGenerator
    self.descriptor_generator_inst = plugin.from_plugin_config(
        self.generator_config,
        get_descriptor_generator_impls)

    @self.route("/nn/<path:uri>")
    @self.route("/nn/n=<int:n>/<path:uri>")
    @self.route("/nn/n=<int:n>/<int:start_i>:<int:end_i>/<path:uri>")
    def compute_nearest_neighbors(uri, n=10, start_i=None, end_i=None):
        """
        Compute a descriptor for the data referenced by the given URI and
        look up its nearest neighbors.

        Data modes for upload/use::

            - local filepath
            - base64
            - http/s URL

        The following sub-sections detail how different URI's can be used.

        Local Filepath
        --------------
        The URI string must be prefixed with ``file://``, followed by the
        full path to the data file to describe.

        Base 64 data
        ------------
        The URI string must be prefixed with "base64://", followed by the
        base64 encoded string. This mode also requires an additional
        ``?content_type=`` to provide data content type information. This
        mode saves the encoded data to temporary file for processing.

        HTTP/S address
        --------------
        This is the default mode when the URI prefix is none of the above.
        This uses the requests module to locally download a data file for
        processing.

        JSON Return format::

            {
                "success": <bool>
                "message": <str>
                "neighbors": <None|list[float]>
                "reference_uri": <str>
            }

        :type uri: str
        """
        message = "execution nominal"
        descriptor = None

        de = None
        try:
            self.log.debug("Received URI: %s", uri)
            de = self.resolve_data_element(uri)
        # NOTE(review): ``except X, ex`` is Python-2-only syntax; this
        # block will not parse under Python 3.
        except ValueError, ex:
            message = "URI resolution issue: %s" % str(ex)

        if de:
            try:
                descriptor = self.descriptor_generator_inst.\
                    compute_descriptor(de, self.descr_elem_factory)
            except RuntimeError, ex:
                message = "Descriptor extraction failure: %s" % str(ex)
            except ValueError, ex:
                message = "Data content type issue: %s" % str(ex)
        # NOTE(review): no neighbor query or response construction follows
        # the descriptor computation -- this handler appears truncated in
        # this revision; confirm against the complete upstream source.
"""
Script: load the descriptor type name and descriptor-factory configuration
from local files and construct a DescriptorElementFactory.
"""
import json

from smqtk.representation import DescriptorElementFactory
from smqtk.utils.bin_utils import logging, initialize_logging
from smqtk.utils.jsmin import jsmin

from load_algo import load_algo


if not logging.getLogger().handlers:
    initialize_logging(logging.getLogger(), logging.DEBUG)
log = logging.getLogger(__name__)


log.info("Loading descriptor elements")
# (fix) read input files via context managers so the handles are closed
# deterministically instead of relying on garbage collection.
with open("descriptor_type_name.txt") as f:
    d_type_str = f.read().strip()
# Factory configuration is commented JSON; minify with jsmin before parsing.
with open('descriptor_factory_config.json') as f:
    df_config = json.loads(jsmin(f.read()))
factory = DescriptorElementFactory.from_config(df_config)

#
# Sample code for finding non-NaN descriptors in parallel
#
# def add_non_nan_uuid(uuid):
#     d = factory.new_descriptor(d_type_str, uuid)
#     if d.vector().sum() > 0:
#         return uuid
#     return None
#
# import multiprocessing
# p = multiprocessing.Pool()
# non_nan_uuids = \
#     p.map(add_non_nan_uuid,
#           (l.strip() for l in open('descriptor_uuids.txt')))
"""
Script: load the descriptor type name and descriptor-factory configuration
from local files and construct a DescriptorElementFactory.
"""
# (fix) ``json`` was not imported even though ``json.loads`` is used below,
# which raised NameError at runtime.
import json

from smqtk.representation import DescriptorElementFactory
from smqtk.utils.bin_utils import logging, initialize_logging
from smqtk.utils.jsmin import jsmin

from load_algo import load_algo


if not logging.getLogger().handlers:
    initialize_logging(logging.getLogger(), logging.DEBUG)
log = logging.getLogger(__name__)


log.info("Loading descriptor elements")
# (fix) read input files via context managers so the handles are closed
# deterministically instead of relying on garbage collection.
with open("descriptor_type_name.txt") as f:
    d_type_str = f.read().strip()
# Factory configuration is commented JSON; minify with jsmin before parsing.
with open('descriptor_factory_config.json') as f:
    df_config = json.loads(jsmin(f.read()))
factory = DescriptorElementFactory.from_config(df_config)

#
# Sample code for finding non-NaN descriptors in parallel
#
# def add_non_nan_uuid(uuid):
#     d = factory.new_descriptor(d_type_str, uuid)
#     if d.vector().sum() > 0:
#         return uuid
#     return None
#
# import multiprocessing
# p = multiprocessing.Pool()
# non_nan_uuids = \
#     p.map(add_non_nan_uuid,
#           (l.strip() for l in open('descriptor_uuids.txt')))
def __init__(self, json_config):
    """
    Initialize the application based on the supplied JSON configuration.

    Builds the descriptor element factory, optional descriptor index,
    nearest-neighbors index and descriptor generator from configuration,
    then registers the ``/count`` and ``/compute/<uri>`` flask routes.

    :param json_config: JSON configuration dictionary
    :type json_config: dict
    """
    super(NearestNeighborServiceServer, self).__init__(json_config)

    # Whether computed descriptors should also be added to the configured
    # descriptor index.
    self.update_index = json_config['update_descriptor_index']

    # Descriptor factory setup
    self._log.info("Initializing DescriptorElementFactory")
    self.descr_elem_factory = DescriptorElementFactory.from_config(
        self.json_config['descriptor_factory'])

    # Only constructed when index updating is enabled.
    #: :type: smqtk.representation.DescriptorIndex | None
    self.descr_index = None
    if self.update_index:
        self._log.info("Initializing DescriptorIndex to update")
        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = plugin.from_plugin_config(
            json_config['descriptor_index'],
            get_descriptor_index_impls())

    #: :type: smqtk.algorithms.NearestNeighborsIndex
    self.nn_index = plugin.from_plugin_config(json_config['nn_index'],
                                              get_nn_index_impls())

    #: :type: smqtk.algorithms.DescriptorGenerator
    self.descriptor_generator_inst = plugin.from_plugin_config(
        self.json_config['descriptor_generator'],
        get_descriptor_generator_impls())

    @self.route("/count", methods=['GET'])
    def count():
        """
        Return the number of elements represented in this index.
        """
        return flask.jsonify(**{
            "count": self.nn_index.count(),
        })

    @self.route("/compute/<path:uri>", methods=["POST"])
    def compute(uri):
        """
        Compute the descriptor for a URI specified data element using the
        configured descriptor generator.

        If a descriptor index was configured and update was turned on, we
        add the computed descriptor to the index.

        JSON Return format::

            {
                "success": <bool>
                "message": <str>
                "descriptor": <None|list[float]>
                "reference_uri": <str>
            }

        :param uri: URI data specification.
        """
        descriptor = None
        try:
            # Pair return: (data element, descriptor element); only the
            # descriptor is used here.
            _, descriptor = self.generate_descriptor_for_uri(uri)
            message = "Descriptor generated"
            # NOTE(review): Python 2 ``map`` returns a list; under Python 3
            # this would be a lazy iterator and not JSON-serializable as-is.
            descriptor = map(float, descriptor.vector())
        # NOTE(review): ``except X, ex`` is Python-2-only syntax; this
        # block will not parse under Python 3.
        except ValueError, ex:
            message = "Input value issue: %s" % str(ex)
        except RuntimeError, ex:
            message = "Descriptor extraction failure: %s" % str(ex)
        # NOTE(review): no response is constructed or returned after the
        # try/except -- this handler appears truncated in this revision;
        # confirm against the complete upstream source.
def classify_files(config, label, file_globs): log = logging.getLogger(__name__) #: :type: smqtk.algorithms.Classifier classifier = \ plugin.from_plugin_config(config['classifier'], get_classifier_impls()) def log_avaialable_labels(): log.info("Available classifier labels:") for l in classifier.get_labels(): log.info("- %s", l) if label is None: log_avaialable_labels() return elif label not in classifier.get_labels(): log.error( "Invalid classification label provided to compute and filter " "on: '%s'", label) log_avaialable_labels() return log.info("Collecting files from globs") #: :type: list[DataFileElement] data_elements = [] uuid2filepath = {} for g in file_globs: if os.path.isfile(g): d = DataFileElement(g) data_elements.append(d) uuid2filepath[d.uuid()] = g else: log.debug("expanding glob: %s", g) for fp in glob.iglob(g): d = DataFileElement(fp) data_elements.append(d) uuid2filepath[d.uuid()] = fp if not data_elements: raise RuntimeError("No files provided for classification.") log.info("Computing descriptors") descriptor_factory = \ DescriptorElementFactory.from_config(config['descriptor_factory']) #: :type: smqtk.algorithms.DescriptorGenerator descriptor_generator = \ plugin.from_plugin_config(config['descriptor_generator'], get_descriptor_generator_impls()) descr_map = descriptor_generator\ .compute_descriptor_async(data_elements, descriptor_factory) log.info("Classifying descriptors") classification_factory = ClassificationElementFactory \ .from_config(config['classification_factory']) classification_map = classifier\ .classify_async(descr_map.values(), classification_factory) log.info("Printing input file paths that classified as the given label.") # map of UUID to filepath: uuid2c = dict((c.uuid, c) for c in classification_map.itervalues()) for data in data_elements: if uuid2c[data.uuid()].max_label() == label: print uuid2filepath[data.uuid()]
def __init__(self, json_config):
    """
    Initialize the application based on the supplied JSON configuration.

    Builds the descriptor element factory, optional descriptor index,
    nearest-neighbors index and descriptor generator from configuration,
    then registers the ``/count``, ``/compute`` and ``/nn`` flask routes
    as closures over those instances.

    :param json_config: JSON configuration dictionary
    :type json_config: dict
    """
    super(NearestNeighborServiceServer, self).__init__(json_config)

    # Whether computed descriptors should also be added to the configured
    # descriptor index.
    self.update_index = json_config['update_descriptor_index']

    # Descriptor factory setup
    self._log.info("Initializing DescriptorElementFactory")
    self.descr_elem_factory = DescriptorElementFactory.from_config(
        self.json_config['descriptor_factory'])

    # Only constructed when index updating is enabled.
    #: :type: smqtk.representation.DescriptorIndex | None
    self.descr_index = None
    if self.update_index:
        self._log.info("Initializing DescriptorIndex to update")
        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = plugin.from_plugin_config(
            json_config['descriptor_index'],
            get_descriptor_index_impls())

    #: :type: smqtk.algorithms.NearestNeighborsIndex
    self.nn_index = plugin.from_plugin_config(json_config['nn_index'],
                                              get_nn_index_impls())

    #: :type: smqtk.algorithms.DescriptorGenerator
    self.descriptor_generator_inst = plugin.from_plugin_config(
        self.json_config['descriptor_generator'],
        get_descriptor_generator_impls())

    @self.route("/count", methods=['GET'])
    def count():
        """
        Return the number of elements represented in this index.
        """
        return flask.jsonify(**{
            "count": self.nn_index.count(),
        })

    @self.route("/compute/<path:uri>", methods=["POST"])
    def compute(uri):
        """
        Compute the descriptor for a URI specified data element using the
        configured descriptor generator.

        See ``compute_nearest_neighbors`` method docstring for URI
        specifications accepted.

        If a descriptor index was configured and update was turned on, we
        add the computed descriptor to the index.

        JSON Return format::

            {
                "success": <bool>
                "message": <str>
                "descriptor": <None|list[float]>
                "reference_uri": <str>
            }

        :param uri: URI data specification.
        """
        descriptor = None
        try:
            descriptor = self.generate_descriptor_for_uri(uri)
            message = "Descriptor generated"
            # Convert the vector into a JSON-serializable list of floats.
            descriptor = list(map(float, descriptor.vector()))
        except ValueError as ex:
            message = "Input value issue: %s" % str(ex)
        except RuntimeError as ex:
            message = "Descriptor extraction failure: %s" % str(ex)

        return flask.jsonify(
            success=descriptor is not None,
            message=message,
            descriptor=descriptor,
            reference_uri=uri,
        )

    @self.route("/nn/<path:uri>")
    @self.route("/nn/n=<int:n>/<path:uri>")
    @self.route("/nn/n=<int:n>/<int:start_i>:<int:end_i>/<path:uri>")
    def compute_nearest_neighbors(uri, n=10, start_i=None, end_i=None):
        """
        Data modes for upload/use:

            - local filepath
            - base64
            - http/s URL
            - existing data/descriptor UUID

        The following sub-sections detail how different URI's can be used.

        Local Filepath
        --------------
        The URI string must be prefixed with ``file://``, followed by the
        full path to the data file to describe.

        Base 64 data
        ------------
        The URI string must be prefixed with "base64://", followed by the
        base64 encoded string. This mode also requires an additional
        ``?content_type=`` to provide data content type information. This
        mode saves the encoded data to temporary file for processing.

        HTTP/S address
        --------------
        This is the default mode when the URI prefix is none of the above.
        This uses the requests module to locally download a data file for
        processing.

        Existing Data/Descriptor by UUID
        --------------------------------
        When given a uri prefixed with "uuid://", we interpret the remainder
        of the uri as the UUID of a descriptor already present in the
        configured descriptor index. If the given UUID is not present in the
        index, a KeyError is raised.

        JSON Return format
        ------------------
            {
                "success": <bool>
                "message": <str>
                "neighbors": <None|list[float]>
                "reference_uri": <str>
            }

        :param n: Number of neighbors to query for
        :param start_i: The starting index of the neighbor vectors to slice
            into for return.
        :param end_i: The ending index of the neighbor vectors to slice
            into for return.
        :type uri: str
        """
        descriptor = None
        try:
            descriptor = self.generate_descriptor_for_uri(uri)
            message = "descriptor computed"
        except ValueError as ex:
            message = "Input data issue: %s" % str(ex)
        except RuntimeError as ex:
            message = "Descriptor generation failure: %s" % str(ex)

        # Base pagination slicing based on provided start and end indices,
        # otherwise clamp to beginning/ending of queried neighbor sequence.
        page_slice = slice(start_i or 0, end_i or n)
        neighbors = []
        dists = []
        if descriptor is not None:
            try:
                neighbors, dists = \
                    self.nn_index.nn(descriptor, n)
            except ValueError as ex:
                message = "Descriptor or index related issue: %s" % str(ex)

        # TODO: Return the optional descriptor vectors for the neighbors
        # noinspection PyTypeChecker
        d = {
            "success": bool(descriptor is not None),
            "message": message,
            # Note: the comprehension variable shadows the ``n`` parameter;
            # harmless in Python 3 where comprehension scope is local.
            "neighbors": [n.uuid() for n in neighbors[page_slice]],
            "distances": dists[page_slice],
            "reference_uri": uri
        }
        return flask.jsonify(d)
def __init__(self, json_config):
    """
    Initialize the application based on the supplied JSON configuration.

    Builds the descriptor element factory, optional descriptor index,
    nearest-neighbors index and descriptor generator from configuration,
    then registers the ``/count``, ``/compute`` and ``/nn`` flask routes
    as closures over those instances.

    :param json_config: JSON configuration dictionary
    :type json_config: dict
    """
    super(NearestNeighborServiceServer, self).__init__(json_config)

    # Whether computed descriptors should also be added to the configured
    # descriptor index.
    self.update_index = json_config['update_descriptor_index']

    # Descriptor factory setup
    self._log.info("Initializing DescriptorElementFactory")
    self.descr_elem_factory = DescriptorElementFactory.from_config(
        self.json_config['descriptor_factory']
    )

    # Only constructed when index updating is enabled.
    #: :type: smqtk.representation.DescriptorIndex | None
    self.descr_index = None
    if self.update_index:
        self._log.info("Initializing DescriptorIndex to update")
        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = plugin.from_plugin_config(
            json_config['descriptor_index'],
            get_descriptor_index_impls()
        )

    #: :type: smqtk.algorithms.NearestNeighborsIndex
    self.nn_index = plugin.from_plugin_config(
        json_config['nn_index'],
        get_nn_index_impls()
    )

    #: :type: smqtk.algorithms.DescriptorGenerator
    self.descriptor_generator_inst = plugin.from_plugin_config(
        self.json_config['descriptor_generator'],
        get_descriptor_generator_impls()
    )

    @self.route("/count", methods=['GET'])
    def count():
        """
        Return the number of elements represented in this index.
        """
        return flask.jsonify(**{
            "count": self.nn_index.count(),
        })

    @self.route("/compute/<path:uri>", methods=["POST"])
    def compute(uri):
        """
        Compute the descriptor for a URI specified data element using the
        configured descriptor generator.

        See ``compute_nearest_neighbors`` method docstring for URI
        specifications accepted.

        If a descriptor index was configured and update was turned on, we
        add the computed descriptor to the index.

        JSON Return format::

            {
                "success": <bool>
                "message": <str>
                "descriptor": <None|list[float]>
                "reference_uri": <str>
            }

        :param uri: URI data specification.
        """
        descriptor = None
        try:
            descriptor = self.generate_descriptor_for_uri(uri)
            message = "Descriptor generated"
            # Convert the vector into a JSON-serializable list of floats.
            descriptor = list(map(float, descriptor.vector()))
        except ValueError as ex:
            message = "Input value issue: %s" % str(ex)
        except RuntimeError as ex:
            message = "Descriptor extraction failure: %s" % str(ex)

        return flask.jsonify(
            success=descriptor is not None,
            message=message,
            descriptor=descriptor,
            reference_uri=uri,
        )

    @self.route("/nn/<path:uri>")
    @self.route("/nn/n=<int:n>/<path:uri>")
    @self.route("/nn/n=<int:n>/<int:start_i>:<int:end_i>/<path:uri>")
    def compute_nearest_neighbors(uri, n=10, start_i=None, end_i=None):
        """
        Data modes for upload/use:

            - local filepath
            - base64
            - http/s URL
            - existing data/descriptor UUID

        The following sub-sections detail how different URI's can be used.

        Local Filepath
        --------------
        The URI string must be prefixed with ``file://``, followed by the
        full path to the data file to describe.

        Base 64 data
        ------------
        The URI string must be prefixed with "base64://", followed by the
        base64 encoded string. This mode also requires an additional
        ``?content_type=`` to provide data content type information. This
        mode saves the encoded data to temporary file for processing.

        HTTP/S address
        --------------
        This is the default mode when the URI prefix is none of the above.
        This uses the requests module to locally download a data file for
        processing.

        Existing Data/Descriptor by UUID
        --------------------------------
        When given a uri prefixed with "uuid://", we interpret the remainder
        of the uri as the UUID of a descriptor already present in the
        configured descriptor index. If the given UUID is not present in the
        index, a KeyError is raised.

        JSON Return format
        ------------------
            {
                "success": <bool>
                "message": <str>
                "neighbors": <None|list[float]>
                "reference_uri": <str>
            }

        :param n: Number of neighbors to query for
        :param start_i: The starting index of the neighbor vectors to slice
            into for return.
        :param end_i: The ending index of the neighbor vectors to slice
            into for return.
        :type uri: str
        """
        descriptor = None
        try:
            descriptor = self.generate_descriptor_for_uri(uri)
            message = "descriptor computed"
        except ValueError as ex:
            message = "Input data issue: %s" % str(ex)
        except RuntimeError as ex:
            message = "Descriptor generation failure: %s" % str(ex)

        # Base pagination slicing based on provided start and end indices,
        # otherwise clamp to beginning/ending of queried neighbor sequence.
        page_slice = slice(start_i or 0, end_i or n)
        neighbors = []
        dists = []
        if descriptor is not None:
            try:
                neighbors, dists = \
                    self.nn_index.nn(descriptor, n)
            except ValueError as ex:
                message = "Descriptor or index related issue: %s" % str(ex)

        # TODO: Return the optional descriptor vectors for the neighbors
        # noinspection PyTypeChecker
        d = {
            "success": bool(descriptor is not None),
            "message": message,
            # Note: the comprehension variable shadows the ``n`` parameter;
            # harmless in Python 3 where comprehension scope is local.
            "neighbors": [n.uuid() for n in neighbors[page_slice]],
            "distances": dists[page_slice],
            "reference_uri": uri
        }
        return flask.jsonify(d)
def main():
    """
    CLI entry point: compute a descriptor vector for one input data file,
    writing it to an optional numpy output file or printing it to stdout.
    """
    parser = cli_parser()
    args = parser.parse_args()

    output_filepath = args.output_filepath
    overwrite = args.overwrite
    verbose = args.verbose

    llevel = logging.DEBUG if verbose else logging.INFO
    bin_utils.initialize_logging(logging.getLogger(), llevel)
    log = logging.getLogger("main")

    # Merge loaded config with default
    config_loaded = False
    config = default_config()
    if args.config:
        if os.path.isfile(args.config):
            with open(args.config, 'r') as f:
                config.update(json.load(f))
            config_loaded = True
        else:
            # (fix) was ``elif not os.path.isfile(args.config)``, which is
            # always true when reached; plain ``else`` is equivalent.
            log.error("Configuration file path not valid.")
            exit(1)

    bin_utils.output_config(args.output_config, config, log, True)

    # Configuration must have been loaded at this point since we can't
    # normally trust the default.
    if not config_loaded:
        log.error("No configuration provided")
        exit(1)

    if not args.input_file:
        log.error("Failed to provide an input file path")
        exit(1)
    elif not os.path.isfile(args.input_file):
        log.error("Given path does not point to a file.")
        exit(1)

    input_filepath = args.input_file
    data_element = DataFileElement(input_filepath)

    factory = DescriptorElementFactory.from_config(
        config['descriptor_factory'])
    #: :type: smqtk.algorithms.descriptor_generator.DescriptorGenerator
    cd = plugin.from_plugin_config(config['content_descriptor'],
                                   get_descriptor_generator_impls)
    descr_elem = cd.compute_descriptor(data_element, factory, overwrite)
    vec = descr_elem.vector()

    if vec is None:
        log.error("Failed to generate a descriptor vector for the input data!")
        # (fix) abort like the other error paths above -- previously we fell
        # through and tried to save/iterate a None vector.
        exit(1)

    if output_filepath:
        numpy.save(output_filepath, vec)
    else:
        # Construct string, because numpy
        s = []
        # noinspection PyTypeChecker
        for f in vec:
            s.append('%15f' % f)
        # (fix) parenthesized form is valid in both Python 2 and 3.
        print(' '.join(s))