예제 #1
0
def main():
    """
    Command-line entry point: compute the descriptor vector of one file.

    Arguments are parsed with ``cli_parser()`` and the configuration is
    obtained from ``cli.utility_main_helper``.  After validating the input
    path, the descriptor element factory and descriptor generator are built
    from the ``descriptor_factory`` and ``content_descriptor`` configuration
    sections.  The vector returned by ``generate_vector`` is saved with
    ``numpy.save`` when an output file path was given; otherwise it is
    printed to standard output as space-separated ``%15f`` values.

    Exits with status 1 when no input file was given or when the given path
    is not an existing file.
    """
    args = cli_parser().parse_args()
    config = cli.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    destination, clobber = args.output_filepath, args.overwrite

    # Validate the input path before constructing any plugin instances.
    source_path = args.input_file
    if not source_path:
        log.error("Failed to provide an input file path")
        exit(1)
    elif not os.path.isfile(source_path):
        log.error("Given path does not point to a file.")
        exit(1)

    element = DataFileElement(source_path)
    elem_factory = DescriptorElementFactory.from_config(
        config['descriptor_factory']
    )
    #: :type: smqtk.algorithms.descriptor_generator.DescriptorGenerator
    generator = from_config_dict(config['content_descriptor'],
                                 DescriptorGenerator.get_impls())

    vec = generate_vector(log, generator, element, elem_factory, clobber)

    if output_requested(destination) if False else destination:
        numpy.save(destination, vec)
        return

    # Format each component ourselves rather than relying on numpy's
    # array printing.
    # noinspection PyTypeChecker
    print(' '.join(['%15f' % component for component in vec]))
예제 #2
0
파일: iqr_search.py 프로젝트: dhandeo/SMQTK
    def from_config(cls, config, parent_app):
        """
        Build an instance of this class from a JSON-compliant configuration
        dictionary.

        The given configuration is overlaid on ``cls.get_default_config()``.
        The ``data_set``, ``descr_generator`` and ``nn_index`` sections are
        turned into plugin instances, and ``descriptor_factory`` into a
        ``DescriptorElementFactory``, before being passed to the constructor
        as keyword arguments.

        :param config: JSON compliant dictionary encapsulating
            a configuration.
        :type config: dict

        :param parent_app: Parent containing flask app instance
        :type parent_app: smqtk.web.search_app.app.search_app

        :return: Constructed instance from the provided config.
        :rtype: IqrSearch

        """
        params = cls.get_default_config()
        params.update(config)

        # Each nested plugin section is paired with the function that lists
        # the implementations available for it.
        plugin_sections = (
            ('data_set', get_data_set_impls),
            ('descr_generator', get_descriptor_generator_impls),
            ('nn_index', get_nn_index_impls),
        )
        for section, list_impls in plugin_sections:
            params[section] = plugin.from_plugin_config(params[section],
                                                        list_impls())

        factory_config = params['descriptor_factory']
        params['descriptor_factory'] = \
            DescriptorElementFactory.from_config(factory_config)

        return cls(parent_app, **params)
예제 #3
0
    def from_config(cls, config_dict, type_str, uuid, merge_default=True):
        """
        Construct an instance from a configuration dictionary.

        The ``wrapped_element_factory`` entry of the configuration is
        converted into a ``DescriptorElementFactory`` instance before
        delegating to the parent class' ``from_config``.

        :param config_dict: Configuration dictionary containing a
            ``wrapped_element_factory`` entry.  The dictionary passed in is
            not modified.
        :type config_dict: dict

        :param type_str: Passed through unchanged to the parent
            ``from_config``.
        :param uuid: Passed through unchanged to the parent ``from_config``.
        :param merge_default: Passed through unchanged to the parent
            ``from_config``.

        :return: Whatever the parent class' ``from_config`` returns.

        """
        # Work on a shallow copy: replacing the factory configuration in the
        # caller's dictionary would leave it holding a factory instance, so
        # the same configuration could not be used for a second call.
        config_dict = dict(config_dict)

        # convert factory configuration
        config_dict["wrapped_element_factory"] = DescriptorElementFactory.from_config(
            config_dict["wrapped_element_factory"]
        )

        return super(CachingDescriptorElement, cls).from_config(config_dict, type_str, uuid, merge_default)
예제 #4
0
    def from_config(cls, config, parent_app):
        """
        Create a new instance of this class from a JSON-compliant
        configuration dictionary holding its initialization arguments.

        Values in ``config`` override those of ``cls.get_default_config()``.
        Nested plugin sections and the descriptor factory section are
        replaced by constructed objects before the class constructor is
        invoked.

        :param config: JSON compliant dictionary encapsulating
            a configuration.
        :type config: dict

        :param parent_app: Parent containing flask app instance
        :type parent_app: smqtk.web.search_app.app.search_app

        :return: Constructed instance from the provided config.
        :rtype: IqrSearch

        """
        kwargs = cls.get_default_config()
        kwargs.update(config)

        def build_plugin(key, impl_getter):
            # Replace the configuration stored under ``key`` with the plugin
            # instance constructed from it.
            kwargs[key] = plugin.from_plugin_config(kwargs[key],
                                                    impl_getter())

        # construct nested objects via configurations
        build_plugin('data_set', get_data_set_impls)
        build_plugin('descr_generator', get_descriptor_generator_impls)
        build_plugin('nn_index', get_nn_index_impls)

        kwargs['descriptor_factory'] = DescriptorElementFactory.from_config(
            kwargs['descriptor_factory']
        )

        return cls(parent_app, **kwargs)
예제 #5
0
def run_file_list(c,
                  filelist_filepath,
                  checkpoint_filepath,
                  batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath to
        SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before queueing
        that file for processing. If the check fails, the file is skipped
        instead of a halting exception being raised.
    :type check_image: bool

    """
    log = logging.getLogger(__name__)

    # Read the list inside a context manager so the file handle is closed
    # as soon as the paths have been collected.
    with open(filelist_filepath) as filelist:
        file_paths = [line.strip() for line in filelist]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def test_image_load(dfe):
        """
        Return True when PIL can open the element's bytes as an image,
        otherwise log a warning and return False.
        """
        try:
            PIL.Image.open(io.BytesIO(dfe.get_bytes()))
            return True
        except IOError as ex:
            # noinspection PyProtectedMember
            log.warning(
                "Failed to convert '%s' bytes into an image "
                "(error: %s). Skipping", dfe._filepath, str(ex))
            return False

    # NOTE(review): the visible code ends here; ``checkpoint_filepath``,
    # ``batch_size``, ``check_image`` and ``test_image_load`` are not used in
    # the portion shown -- confirm against the complete function.
예제 #6
0
    def from_config(cls, config_dict, type_str, uuid, merge_default=True):
        """
        Construct an instance from a configuration dictionary.

        The ``wrapped_element_factory`` entry is converted into a
        ``DescriptorElementFactory`` instance, then construction is delegated
        to the parent class' ``from_config``.

        :param config_dict: Configuration dictionary containing a
            ``wrapped_element_factory`` entry.  The dictionary passed in is
            left untouched.
        :type config_dict: dict

        :param type_str: Forwarded as-is to the parent ``from_config``.
        :param uuid: Forwarded as-is to the parent ``from_config``.
        :param merge_default: Forwarded as-is to the parent ``from_config``.

        :return: Result of the parent class' ``from_config``.

        """
        # Shallow-copy first so the caller's configuration keeps its original
        # factory configuration and stays reusable for later calls.
        config_dict = dict(config_dict)

        # convert factory configuration
        config_dict['wrapped_element_factory'] = \
            DescriptorElementFactory.from_config(
                config_dict['wrapped_element_factory']
            )

        return super(CachingDescriptorElement,
                     cls).from_config(config_dict, type_str, uuid,
                                      merge_default)
예제 #7
0
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size=256):
    """
    Compute descriptors for the files named in a file list.

    Paths whose content type is accepted by the configured descriptor
    generator are passed to ``compute_many_descriptors``; each resulting
    ``path,uuid`` pair is appended to the checkpoint file.  Two pickle files,
    ``valid_file_map.pickle`` and ``invalid_file_map.pickle``, mapping file
    path to content type are written to the current working directory.

    :param c: Configuration dictionary with ``descriptor_factory`` and
        ``descriptor_generator`` sections.
    :type c: dict

    :param filelist_filepath: Path to a text file listing one data file path
        per line.
    :type filelist_filepath: str

    :param checkpoint_filepath: File to which ``path,uuid`` lines are
        appended.
    :type checkpoint_filepath: str

    :param batch_size: Batch size forwarded to ``compute_many_descriptors``
        (default 256).
    :type batch_size: int

    """
    log = logging.getLogger(__name__)

    # Close the list file as soon as its lines have been read.
    with open(filelist_filepath) as filelist:
        file_paths = [line.strip() for line in filelist]

    log.info("Making memory factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = from_plugin_config(c['descriptor_generator'],
                                   get_descriptor_generator_impls)
    log.info("Making descriptor generator -- Done")

    # Filled in lazily by ``iter_valid_files`` as it is consumed.
    valid_file_paths = dict()
    invalid_file_paths = dict()

    def iter_valid_files():
        """Yield paths with an accepted content type, recording the rest."""
        for fp in file_paths:
            dfe = DataFileElement(fp)
            ct = dfe.content_type()
            if ct in generator.valid_content_types():
                valid_file_paths[fp] = ct
                yield fp
            else:
                invalid_file_paths[fp] = ct

    log.info("Computing descriptors")
    m = compute_many_descriptors(iter_valid_files(),
                                 generator,
                                 factory,
                                 batch_size=batch_size,
                                 )

    # Recording computed file paths and associated file UUIDs (SHA1)
    with open(checkpoint_filepath, 'a') as cf:
        for fp, descr in m:
            cf.write("{:s},{:s}\n".format(
                fp, descr.uuid()
            ))
            # Flush per record so progress survives an interruption.
            cf.flush()

    # Output valid file and invalid file dictionaries as pickle
    log.info("Writing valid filepaths map")
    with open('valid_file_map.pickle', 'wb') as f:
        cPickle.dump(valid_file_paths, f)
    log.info("Writing invalid filepaths map")
    with open('invalid_file_map.pickle', 'wb') as f:
        cPickle.dump(invalid_file_paths, f)

    log.info("Done")
예제 #8
0
    def from_config(cls, config_dict, type_str, uuid):
        """
        Construct an instance from a configuration dictionary.

        ``config_dict`` is overlaid on ``cls.get_default_config()`` and the
        ``wrapped_element_factory`` entry of the result is converted into a
        ``DescriptorElementFactory`` before delegating to the parent class'
        ``from_config``.

        :param config_dict: Configuration values overriding the defaults.
        :type config_dict: dict

        :param type_str: Forwarded unchanged to the parent ``from_config``.
        :param uuid: Forwarded unchanged to the parent ``from_config``.

        :return: Result of the parent class' ``from_config``.

        """
        # Defaults first, caller-provided values on top.
        params = cls.get_default_config()
        params.update(config_dict)

        # Swap the factory configuration for the factory built from it.
        factory_key = 'wrapped_element_factory'
        params[factory_key] = \
            DescriptorElementFactory.from_config(params[factory_key])

        parent = super(CachingDescriptorElement, cls)
        return parent.from_config(params, type_str, uuid)
예제 #9
0
    def __init__(self, json_config):
        """
        Initialize the service from its JSON configuration dictionary.

        Configuration sections are converted into instances in this order:
        classification element factory, classifier collection, descriptor
        element factory, descriptor generator and descriptor set.  The
        classifier configuration used for uploaded IQR states is stored
        as-is.  Routes are registered last via ``add_routes``.

        :param json_config: Configuration dictionary, also passed to the
            parent class initializer.
        :type json_config: dict

        """
        super(SmqtkClassifierService, self).__init__(json_config)

        removal_setting = json_config[self.CONFIG_ENABLE_CLASSIFIER_REMOVAL]
        self.enable_classifier_removal = bool(removal_setting)

        label_config = json_config[self.CONFIG_IMMUTABLE_LABELS]
        self.immutable_labels = set(label_config)

        # Classification element factory and classifier collection.
        self.classification_factory = \
            ClassificationElementFactory.from_config(
                json_config[self.CONFIG_CLASSIFICATION_FACTORY])
        #: :type: ClassifierCollection
        self.classifier_collection = ClassifierCollection.from_config(
            json_config[self.CONFIG_CLASSIFIER_COLLECTION])

        # Descriptor element factory and descriptor generator.
        self.descriptor_factory = DescriptorElementFactory.from_config(
            json_config[self.CONFIG_DESCRIPTOR_FACTORY])
        #: :type: smqtk.algorithms.DescriptorGenerator
        self.descriptor_gen = from_config_dict(
            json_config[self.CONFIG_DESCRIPTOR_GENERATOR],
            smqtk.algorithms.DescriptorGenerator.get_impls(),
        )

        # Descriptor set used for classification-by-UID.  A ValueError from
        # construction selects an empty in-memory set instead.
        try:
            descriptor_set = from_config_dict(
                json_config.get(self.CONFIG_DESCRIPTOR_SET, {}),
                DescriptorSet.get_impls(),
            )
        except ValueError:
            descriptor_set = MemoryDescriptorSet()
        self.descriptor_set = descriptor_set

        # Classifier configuration applied to uploaded IQR states.
        self.iqr_state_classifier_config = \
            json_config[self.CONFIG_IQR_CLASSIFIER]

        self.add_routes()
예제 #10
0
    def from_config(cls, config_dict, type_str, uuid):
        """
        Build an instance from a configuration dictionary.

        The class' default configuration is updated with ``config_dict``;
        the ``wrapped_element_factory`` entry is then replaced by the
        ``DescriptorElementFactory`` constructed from it and the result is
        handed to the parent class' ``from_config``.

        :param config_dict: Configuration values overriding the defaults.
        :type config_dict: dict

        :param type_str: Passed through to the parent ``from_config``.
        :param uuid: Passed through to the parent ``from_config``.

        :return: Result of the parent class' ``from_config``.

        """
        full_config = cls.get_default_config()
        full_config.update(config_dict)

        # Turn the nested factory configuration into a factory instance.
        wrapped_factory = DescriptorElementFactory.from_config(
            full_config['wrapped_element_factory']
        )
        full_config['wrapped_element_factory'] = wrapped_factory

        base = super(CachingDescriptorElement, cls)
        return base.from_config(full_config, type_str, uuid)
예제 #11
0
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath to
        SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Flag defaulting to False.
        NOTE(review): not used in the visible portion of this function;
        presumably enables the image-load check below -- confirm.
    :type check_image: bool

    """
    log = logging.getLogger(__name__)

    # Use a context manager so the list file is closed once read.
    with open(filelist_filepath) as filelist:
        file_paths = [line.strip() for line in filelist]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def test_image_load(dfe):
        """
        Return True when PIL can open the element's bytes as an image,
        otherwise log a warning and return False.
        """
        try:
            PIL.Image.open(io.BytesIO(dfe.get_bytes()))
            return True
        except IOError as ex:
            # noinspection PyProtectedMember
            log.warning("Failed to convert '%s' bytes into an image "
                        "(error: %s). Skipping",
                        dfe._filepath, str(ex))
            return False
    def test_configuration(self):
        # The default configuration selects no type but lists the in-memory
        # element implementation as an option.
        default_cfg = DescriptorElementFactory.get_default_config()
        ntools.assert_is_none(default_cfg['type'])
        ntools.assert_in('DescriptorMemoryElement', default_cfg)

        # Selecting the memory element yields a factory bound to that class
        # with an empty type configuration.
        default_cfg['type'] = 'DescriptorMemoryElement'
        elem_factory = DescriptorElementFactory.from_config(default_cfg)
        bound_name = elem_factory._d_type.__name__
        ntools.assert_equal(bound_name, DescriptorMemoryElement.__name__)
        ntools.assert_equal(elem_factory._d_type_config, {})

        # Elements produced by the factory report the requested type label
        # and UUID.
        element = elem_factory.new_descriptor('test', 'foo')
        ntools.assert_equal(element.type(), 'test')
        ntools.assert_equal(element.uuid(), 'foo')
예제 #13
0
    def test_configuration(self):
        # Default configuration: no type chosen, memory element available.
        config = DescriptorElementFactory.get_default_config()
        self.assertIsNone(config['type'])
        self.assertIn('DescriptorMemoryElement', config)

        # Choose the memory element and build the factory from the config.
        config['type'] = 'DescriptorMemoryElement'
        built = DescriptorElementFactory.from_config(config)
        expected_name = DescriptorMemoryElement.__name__
        self.assertEqual(built._d_type.__name__, expected_name)
        self.assertEqual(built._d_type_config, {})

        # A new descriptor carries the type label and UUID it was given.
        descriptor = built.new_descriptor('test', 'foo')
        self.assertEqual(descriptor.type(), 'test')
        self.assertEqual(descriptor.uuid(), 'foo')
예제 #14
0
    def test_configuration(self):
        memory_type = 'DescriptorMemoryElement'

        # Nothing is selected by default, but the memory element is offered.
        cfg = DescriptorElementFactory.get_default_config()
        ntools.assert_is_none(cfg['type'])
        ntools.assert_in('DescriptorMemoryElement', cfg)

        # Build a factory configured for the memory element type.
        cfg['type'] = memory_type
        f = DescriptorElementFactory.from_config(cfg)
        ntools.assert_equal(f._d_type.__name__,
                            DescriptorMemoryElement.__name__)
        ntools.assert_equal(f._d_type_config, {})

        # Type label and UUID are propagated to the produced element.
        produced = f.new_descriptor('test', 'foo')
        ntools.assert_equal(produced.type(), 'test')
        ntools.assert_equal(produced.uuid(), 'foo')
        DescriptorMemoryElement.MEMORY_CACHE = {}
    def test_configuration(self):
        # Implementations are keyed by their fully qualified class path.
        dme_key = 'smqtk.representation.descriptor_element.local_elements.DescriptorMemoryElement'

        # Default configuration: no type chosen, memory element available.
        default_cfg = DescriptorElementFactory.get_default_config()
        self.assertIsNone(default_cfg['type'])
        self.assertIn(dme_key, default_cfg)

        # Select the memory element and construct the factory.
        default_cfg['type'] = dme_key
        elem_factory = DescriptorElementFactory.from_config(default_cfg)
        element_cls_name = elem_factory._d_type.__name__
        self.assertEqual(element_cls_name, DescriptorMemoryElement.__name__)
        self.assertEqual(elem_factory._d_type_config, {})

        # The produced element reports the given type label and UUID.
        element = elem_factory.new_descriptor('test', 'foo')
        self.assertEqual(element.type(), 'test')
        self.assertEqual(element.uuid(), 'foo')
예제 #16
0
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size):
    """
    Compute descriptors for the files named in a file list.

    Files whose content type is accepted by the configured descriptor
    generator are wrapped in ``DataFileElement`` instances and passed to
    ``compute_many_descriptors``; one line per result is appended to the
    checkpoint file.  Two pickle files, ``file_map.valid.pickle`` and
    ``file_map.invalid.pickle``, mapping file path to content type are
    written to the current working directory.

    :param c: Configuration dictionary with ``descriptor_factory`` and
        ``descriptor_generator`` sections.
    :type c: dict

    :param filelist_filepath: Path to a text file listing one data file path
        per line.
    :type filelist_filepath: str

    :param checkpoint_filepath: File to which result lines are appended.
    :type checkpoint_filepath: str

    :param batch_size: Batch size forwarded to ``compute_many_descriptors``.

    """
    log = logging.getLogger(__name__)

    # Close the list file as soon as its lines have been read.
    with open(filelist_filepath) as filelist:
        file_paths = [line.strip() for line in filelist]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c["descriptor_factory"])

    log.info("Making descriptor generator '%s'", c["descriptor_generator"]["type"])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = from_plugin_config(c["descriptor_generator"], get_descriptor_generator_impls)
    log.info("Making descriptor generator -- Done")

    # Filled in lazily by ``iter_valid_elements`` as it is consumed.
    valid_file_paths = dict()
    invalid_file_paths = dict()

    def iter_valid_elements():
        """Yield elements with an accepted content type, recording the rest."""
        for fp in file_paths:
            dfe = DataFileElement(fp)
            ct = dfe.content_type()
            if ct in generator.valid_content_types():
                valid_file_paths[fp] = ct
                yield dfe
            else:
                invalid_file_paths[fp] = ct

    log.info("Computing descriptors")
    m = compute_many_descriptors(iter_valid_elements(), generator, factory, batch_size=batch_size)

    # Recording computed file paths and associated file UUIDs (SHA1)
    # NOTE(review): the generator above yields DataFileElement instances, so
    # ``fp`` here is presumably an element rather than a path string --
    # confirm what ``compute_many_descriptors`` yields.
    with open(checkpoint_filepath, "a") as cf:
        for fp, descr in m:
            cf.write("{:s},{:s}\n".format(fp, descr.uuid()))
            # Flush per record so progress survives an interruption.
            cf.flush()

    # Output valid file and invalid file dictionaries as pickle
    log.info("Writing valid filepaths map")
    with open("file_map.valid.pickle", "wb") as f:
        cPickle.dump(valid_file_paths, f)
    log.info("Writing invalid filepaths map")
    with open("file_map.invalid.pickle", "wb") as f:
        cPickle.dump(invalid_file_paths, f)

    log.info("Done")
예제 #17
0
    def __init__(self, json_config):
        """
        Initialize the service from its JSON configuration dictionary.

        Configuration sections are converted into instances in this order:
        classification element factory, classifier collection, descriptor
        element factory and descriptor generator.  The classifier
        configuration used for uploaded IQR states is stored as-is.  Routes
        are registered last via ``add_routes``.

        :param json_config: Configuration dictionary, also passed to the
            parent class initializer.
        :type json_config: dict

        """
        super(SmqtkClassifierService, self).__init__(json_config)

        removal_setting = json_config[self.CONFIG_ENABLE_CLASSIFIER_REMOVAL]
        self.enable_classifier_removal = bool(removal_setting)

        label_config = json_config[self.CONFIG_IMMUTABLE_LABELS]
        self.immutable_labels = set(label_config)

        # Classification element factory and classifier collection.
        self.classification_factory = \
            ClassificationElementFactory.from_config(
                json_config[self.CONFIG_CLASSIFICATION_FACTORY])
        self.classifier_collection = ClassifierCollection.from_config(
            json_config[self.CONFIG_CLASSIFIER_COLLECTION])

        # Descriptor element factory and descriptor generator.
        self.descriptor_factory = DescriptorElementFactory.from_config(
            json_config[self.CONFIG_DESCRIPTOR_FACTORY])
        #: :type: smqtk.algorithms.DescriptorGenerator
        self.descriptor_gen = smqtk.utils.plugin.from_plugin_config(
            json_config[self.CONFIG_DESCRIPTOR_GENERATOR],
            smqtk.algorithms.get_descriptor_generator_impls(),
        )

        # Classifier configuration applied to uploaded IQR states.
        self.iqr_state_classifier_config = \
            json_config[self.CONFIG_IQR_CLASSIFIER]

        self.add_routes()
예제 #18
0
    def __init__(self, json_config):
        """
        Initialize application based of supplied JSON configuration

        Builds the descriptor element factory, stores the per-label
        descriptor generator configurations, creates the generator instance
        cache with its lock, and registers the service's routes.

        :param json_config: JSON configuration dictionary
        :type json_config: dict

        """
        super(DescriptorServiceServer, self).__init__(json_config)

        # Descriptor factory setup
        self._log.info("Initializing DescriptorElementFactory")
        self.descr_elem_factory = DescriptorElementFactory.from_config(
            self.json_config['descriptor_factory'])

        # Descriptor generator configuration labels
        #: :type: dict[str, dict]
        self.generator_label_configs = self.json_config[
            'descriptor_generators']

        # Cache of DescriptorGenerator instances so we don't have to
        # continuously initialize them as we get requests.
        self.descriptor_cache = {}
        self.descriptor_cache_lock = multiprocessing.RLock()

        @self.route("/")
        def list_ingest_labels():
            """
            Return the sorted descriptor generator configuration labels.
            """
            return flask.jsonify(
                {"labels": sorted(self.generator_label_configs)})

        @self.route("/all/content_types")
        def all_content_types():
            """
            Of available descriptors, what content types are processable, and
            what types are associated to which available descriptor generator.
            """
            all_types = set()
            # Mapping of configuration label to content types that generator
            # can handle
            r = {}
            for label in self.generator_label_configs:
                d = self.get_descriptor_inst(label)
                all_types.update(d.valid_content_types())
                r[label] = sorted(d.valid_content_types())

            return flask.jsonify({"all": sorted(all_types), "by-label": r})

        @self.route("/all/compute/<path:uri>")
        def all_compute(uri):
            """
            Compute descriptors over the specified content for all generators
            that function over the data's content type.

            JSON Return format::

                {
                    "success": <bool>

                    "content_type": <str|None>

                    "message": <str>

                    "descriptors": {  "<label>":  <list[float]>, ... } | None

                    "reference_uri": <str>
                }

            ``content_type`` is ``None`` when the URI could not be resolved
            to a data element.

            """
            message = "execution nominal"

            data_elem = None
            try:
                data_elem = self.resolve_data_element(uri)
            except ValueError as ex:
                message = "Failed URI resolution: %s" % str(ex)

            # A failed resolution leaves ``data_elem`` as None; report a
            # null content type in that case instead of dereferencing None
            # while building the response.
            content_type = None
            if data_elem is not None:
                content_type = data_elem.content_type()

            descriptors = {}
            finished_loop = False
            if data_elem:
                for label in self.generator_label_configs:
                    if content_type in \
                            self.get_descriptor_inst(label).valid_content_types():
                        d = None
                        try:
                            d = self.generate_descriptor(data_elem, label)
                        except RuntimeError as ex:
                            message = "Descriptor extraction failure: %s" \
                                      % str(ex)
                        except ValueError as ex:
                            message = "Data content type issue: %s" % str(ex)

                        # None is recorded when generation failed.
                        descriptors[label] = d and d.vector().tolist()
                if not descriptors:
                    # Report the content type value, not the bound method.
                    message = "No descriptors can handle URI content type: %s" \
                              % content_type
                else:
                    finished_loop = True

            return flask.jsonify({
                "success": finished_loop,
                "content_type": content_type,
                "message": message,
                "descriptors": descriptors,
                "reference_uri": uri
            })

        @self.route("/<string:descriptor_label>/<path:uri>")
        def compute_descriptor(descriptor_label, uri):
            """

            Data modes for upload/use::

                - local filepath
                - base64
                - http/s URL

            The following sub-sections detail how different URI's can be used.

            Local Filepath
            --------------

            The URI string must be prefixed with ``file://``, followed by the
            full path to the data file to describe.

            Base 64 data
            ------------

            The URI string must be prefixed with "base64://", followed by the
            base64 encoded string. This mode also requires an additional
            ``?content_type=`` to provide data content type information. This
            mode saves the encoded data to temporary file for processing.

            HTTP/S address
            --------------

            This is the default mode when the URI prefix is none of the above.
            This uses the requests module to locally download a data file
            for processing.

            JSON Return format::

                {
                    "success": <bool>

                    "message": <str>

                    "descriptor": <None|list[float]>

                    "reference_uri": <str>
                }

            :type descriptor_label: str
            :type uri: str

            """
            message = "execution nominal"
            descriptor = None

            de = None
            try:
                de = self.resolve_data_element(uri)
            except ValueError as ex:
                message = "URI resolution issue: %s" % str(ex)

            if de:
                try:
                    descriptor = self.generate_descriptor(de, descriptor_label)
                except RuntimeError as ex:
                    message = "Descriptor extraction failure: %s" % str(ex)
                except ValueError as ex:
                    message = "Data content type issue: %s" % str(ex)

            return flask.jsonify({
                "success":
                descriptor is not None,
                "message":
                message,
                "descriptor":
                (descriptor is not None and descriptor.vector().tolist())
                or None,
                "reference_uri":
                uri
            })
예제 #19
0
def run_file_list(c,
                  filelist_filepath,
                  checkpoint_filepath,
                  batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath to
        SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before queueing
        that file for processing. If the check fails, the file is skipped
        instead of a halting exception being raised.
    :type check_image: bool

    """
    log = logging.getLogger(__name__)

    # Close the list file as soon as its lines have been read.
    with open(filelist_filepath) as filelist:
        file_paths = [line.strip() for line in filelist]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    descriptor_set = cast(
        DescriptorSet,
        from_config_dict(c['descriptor_set'], DescriptorSet.get_impls()))

    # ``data_set`` added to within the ``iter_valid_elements`` function.
    data_set: Optional[DataSet] = None
    if c['optional_data_set']['type'] is None:
        log.info("Not saving loaded data elements to data set")
    else:
        log.info("Initializing data set to append to")
        data_set = cast(
            DataSet,
            from_config_dict(c['optional_data_set'], DataSet.get_impls()))

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    generator = cast(
        DescriptorGenerator,
        from_config_dict(c['descriptor_generator'],
                         DescriptorGenerator.get_impls()))

    def iter_valid_elements():
        """
        Yield the data elements accepted by ``is_valid_element``, adding them
        to ``data_set`` (in batches when ``batch_size`` is set) if a data set
        was configured.
        """
        def is_valid(file_path):
            # Returns the element itself when valid so the parallel map
            # result can be used directly; False otherwise.
            e = DataFileElement(file_path)

            if is_valid_element(
                    e,
                    valid_content_types=generator.valid_content_types(),
                    check_image=check_image):
                return e
            else:
                return False

        data_elements: Deque[DataFileElement] = collections.deque()
        valid_files_filter = parallel.parallel_map(is_valid,
                                                   file_paths,
                                                   name="check-file-type",
                                                   use_multiprocessing=True)
        for dfe in valid_files_filter:
            if dfe:
                yield dfe
                if data_set is not None:
                    data_elements.append(dfe)
                    if batch_size and len(data_elements) == batch_size:
                        log.debug(
                            "Adding data element batch to set (size: %d)",
                            len(data_elements))
                        data_set.add_data(*data_elements)
                        data_elements.clear()
        # elements only collected if we have a data-set configured, so add any
        # still in the deque to the set
        if data_set is not None and data_elements:
            log.debug("Adding data elements to set (size: %d)",
                      len(data_elements))
            data_set.add_data(*data_elements)

    log.info("Computing descriptors")
    m = compute_many_descriptors(
        iter_valid_elements(),
        generator,
        factory,
        descriptor_set,
        batch_size=batch_size,
    )

    # Recording computed file paths and associated file UUIDs (SHA1)
    # ``newline=''`` lets the csv writer control line endings itself, as the
    # csv module documentation requires for files handed to ``csv.writer``.
    with open(checkpoint_filepath, 'w', newline='') as cf:
        cf_writer = csv.writer(cf)
        pr = ProgressReporter(log.debug, 1.0).start()
        for de, descr in m:
            # We know that we are using DataFileElements going into the
            # compute_many_descriptors, so we can assume that's what comes out
            # of it as well.
            # noinspection PyProtectedMember
            cf_writer.writerow([de._filepath, descr.uuid()])
            pr.increment_report()
        pr.report()

    log.info("Done")
예제 #20
0
    def __init__(self, json_config):
        """
        Initialize application based of supplied JSON configuration

        Builds the descriptor element factory, stores the per-label
        descriptor generator configurations, creates the generator instance
        cache with its lock, and registers the service's routes.

        :param json_config: JSON configuration dictionary
        :type json_config: dict

        """
        super(DescriptorServiceServer, self).__init__(json_config)

        # Descriptor factory setup
        self.log.info("Initializing DescriptorElementFactory")
        self.descr_elem_factory = DescriptorElementFactory.from_config(
            self.json_config['descriptor_factory']
        )

        # Descriptor generator configuration labels
        #: :type: dict[str, dict]
        self.generator_label_configs = self.json_config['descriptor_generators']

        # Cache of DescriptorGenerator instances so we don't have to continuously
        # initialize them as we get requests.
        self.descriptor_cache = {}
        self.descriptor_cache_lock = multiprocessing.RLock()

        @self.route("/")
        def list_ingest_labels():
            """
            Return the sorted descriptor generator configuration labels.
            """
            # Iterating the dict yields its keys on both Python 2 and 3,
            # unlike the Python-2-only ``iterkeys``.
            return flask.jsonify({
                "labels": sorted(self.generator_label_configs)
            })

        @self.route("/all/content_types")
        def all_content_types():
            """
            Of available descriptors, what content types are processable, and
            what types are associated to which available descriptor generator.
            """
            all_types = set()
            # Mapping of configuration label to content types that generator
            # can handle
            r = {}
            for l in self.generator_label_configs:
                d = self.get_descriptor_inst(l)
                all_types.update(d.valid_content_types())
                r[l] = sorted(d.valid_content_types())

            return flask.jsonify({
                "all": sorted(all_types),
                "by-label": r
            })

        @self.route("/all/compute/<path:uri>")
        def all_compute(uri):
            """
            Compute descriptors over the specified content for all generators
            that function over the data's content type.

            JSON Return format::

                {
                    "success": <bool>

                    "content_type": <str>

                    "message": <str>

                    "descriptors": {  "<label>":  <list[float]>, ... } | None

                    "reference_uri": <str>
                }

            """
            message = "execution nominal"

            data_elem = None
            try:
                data_elem = self.resolve_data_element(uri)
            except ValueError as ex:
                message = "Failed URI resolution: %s" % str(ex)

            descriptors = {}
            finished_loop = False
            if data_elem:
                for l in self.generator_label_configs:
                    if data_elem.content_type() \
                            in self.get_descriptor_inst(l).valid_content_types():
                        d = None
                        try:
                            d = self.generate_descriptor(data_elem, l)
                        except RuntimeError as ex:
                            message = "Descriptor extraction failure: %s" \
                                      % str(ex)
                        except ValueError as ex:
                            message = "Data content type issue: %s" % str(ex)

                        # None is recorded when generation failed.
                        descriptors[l] = d and d.vector().tolist()
            # NOTE(review): the visible code ends here without building the
            # JSON response described in the docstring -- confirm against the
            # complete source.
예제 #21
0
def main():
    """
    Compute a descriptor vector for a single input file.

    Command line arguments select the configuration file, the input file and
    an optional output path. The loaded configuration is merged over the
    default configuration. The resulting vector is saved with ``numpy.save``
    when an output path was given, otherwise it is printed to standard out as
    space-separated ``%15f`` formatted values.

    The process exits with status 1 when the configuration file is missing or
    invalid, when the input path is missing or not a file, or when no
    descriptor vector could be produced.
    """
    parser = cli_parser()
    args = parser.parse_args()

    output_filepath = args.output_filepath
    overwrite = args.overwrite
    verbose = args.verbose

    llevel = logging.DEBUG if verbose else logging.INFO
    bin_utils.initialize_logging(logging.getLogger(), llevel)
    log = logging.getLogger("main")

    # Merge loaded config with default
    config_loaded = False
    config = default_config()
    if args.config:
        if not os.path.isfile(args.config):
            log.error("Configuration file path not valid.")
            exit(1)
        with open(args.config, 'r') as f:
            config.update(json.load(f))
        config_loaded = True

    bin_utils.output_config(args.output_config, config, log, True)

    # Configuration must have been loaded at this point since we can't normally
    # trust the default.
    if not config_loaded:
        log.error("No configuration provided")
        exit(1)

    if not args.input_file:
        log.error("Failed to provide an input file path")
        exit(1)
    elif not os.path.isfile(args.input_file):
        log.error("Given path does not point to a file.")
        exit(1)

    input_filepath = args.input_file
    data_element = DataFileElement(input_filepath)

    factory = DescriptorElementFactory.from_config(config['descriptor_factory'])
    #: :type: smqtk.algorithms.descriptor_generator.DescriptorGenerator
    cd = plugin.from_plugin_config(config['content_descriptor'],
                                   get_descriptor_generator_impls())
    descr_elem = cd.compute_descriptor(data_element, factory, overwrite)
    vec = descr_elem.vector()

    if vec is None:
        log.error("Failed to generate a descriptor vector for the input data!")
        # Nothing valid to save or print; stop instead of writing/iterating
        # a None value below.
        exit(1)

    if output_filepath:
        numpy.save(output_filepath, vec)
    else:
        # Construct string, because numpy
        # noinspection PyTypeChecker
        print(' '.join('%15f' % f for f in vec))
예제 #22
0
파일: __init__.py 프로젝트: msarahan/SMQTK
    def __init__(self, json_config):
        """
        Initialize application based on supplied JSON configuration.

        Reads the ``descriptor_factory``, ``descriptor_generator`` and
        ``nn_index`` entries of the configuration and registers the
        nearest-neighbor route on this application.

        :param json_config: JSON configuration dictionary
        :type json_config: dict

        """
        super(NearestNeighborServiceServer, self).__init__(json_config)

        # Descriptor factory setup
        self.log.info("Initializing DescriptorElementFactory")
        self.descr_elem_factory = DescriptorElementFactory.from_config(
            self.json_config['descriptor_factory'])

        # Descriptor generator configuration labels
        #: :type: dict[str, dict]
        self.generator_config = self.json_config['descriptor_generator']

        # NOTE(review): the ``get_*_impls`` getters are passed uncalled here;
        # confirm ``plugin.from_plugin_config`` accepts a callable in this
        # version of the library.
        #: :type: smqtk.algorithms.NearestNeighborsIndex
        self.nn_index = plugin.from_plugin_config(json_config['nn_index'],
                                                  get_nn_index_impls)

        #: :type: smqtk.algorithms.DescriptorGenerator
        self.descriptor_generator_inst = plugin.from_plugin_config(
            self.generator_config, get_descriptor_generator_impls)

        @self.route("/nn/<path:uri>")
        @self.route("/nn/n=<int:n>/<path:uri>")
        @self.route("/nn/n=<int:n>/<int:start_i>:<int:end_i>/<path:uri>")
        def compute_nearest_neighbors(uri, n=10, start_i=None, end_i=None):
            """
            Data modes for upload/use::

                - local filepath
                - base64
                - http/s URL

            The following sub-sections detail how different URI's can be used.

            Local Filepath
            --------------

            The URI string must be prefixed with ``file://``, followed by the
            full path to the data file to describe.

            Base 64 data
            ------------

            The URI string must be prefixed with "base64://", followed by the
            base64 encoded string. This mode also requires an additional
            ``?content_type=`` to provide data content type information. This
            mode saves the encoded data to temporary file for processing.

            HTTP/S address
            --------------

            This is the default mode when the URI prefix is none of the above.
            This uses the requests module to locally download a data file
            for processing.

            JSON Return format::
                {
                    "success": <bool>

                    "message": <str>

                    "neighbors": <None|list[float]>

                    "reference_uri": <str>
                }

            :type uri: str

            """
            message = "execution nominal"
            descriptor = None

            # A failed resolution leaves ``de`` as None and records why in
            # ``message``.
            de = None
            try:
                self.log.debug("Received URI: %s", uri)
                de = self.resolve_data_element(uri)
            except ValueError as ex:
                message = "URI resolution issue: %s" % str(ex)

            if de:
                try:
                    descriptor = self.descriptor_generator_inst.\
                        compute_descriptor(de, self.descr_elem_factory)
                except RuntimeError as ex:
                    message = "Descriptor extraction failure: %s" % str(ex)
                except ValueError as ex:
                    message = "Data content type issue: %s" % str(ex)
            # NOTE(review): the visible excerpt ends here without building a
            # response; confirm the remainder against the full source.
예제 #23
0
def classify_files(config, label, file_globs):
    """
    Print the paths of input files whose top classification is ``label``.

    When ``label`` is None, or is not one of the classifier's labels, the
    available labels are logged and nothing is classified.

    :param config: Configuration dictionary providing the ``classifier``,
        ``descriptor_factory``, ``descriptor_generator`` and
        ``classification_factory`` sections.
    :type config: dict

    :param label: Classification label to filter on, or None to only list
        the available labels.

    :param file_globs: Iterable of file paths and/or glob patterns.

    :raises RuntimeError: No files were found from the given paths/globs.

    """
    log = logging.getLogger(__name__)

    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        config['classifier'], get_classifier_impls()
    )

    def report_labels():
        # Emit one log line per label the classifier knows about.
        log.info("Available classifier labels:")
        for known_label in classifier.get_labels():
            log.info("- %s", known_label)

    if label is None:
        report_labels()
        return
    if label not in classifier.get_labels():
        log.error("Invalid classification label provided to compute and filter "
                  "on: '%s'", label)
        report_labels()
        return

    log.info("Collecting files from globs")
    #: :type: list[DataFileElement]
    data_elements = []
    uuid2filepath = {}
    for pattern in file_globs:
        # An existing file is used directly; anything else is glob-expanded.
        if os.path.isfile(pattern):
            matched_paths = [pattern]
        else:
            log.debug("expanding glob: %s", pattern)
            matched_paths = glob.iglob(pattern)
        for path in matched_paths:
            element = DataFileElement(path)
            data_elements.append(element)
            uuid2filepath[element.uuid()] = path
    if not data_elements:
        raise RuntimeError("No files provided for classification.")

    log.info("Computing descriptors")
    descriptor_factory = DescriptorElementFactory.from_config(
        config['descriptor_factory']
    )
    #: :type: smqtk.algorithms.DescriptorGenerator
    descriptor_generator = plugin.from_plugin_config(
        config['descriptor_generator'], get_descriptor_generator_impls()
    )
    descr_map = descriptor_generator.compute_descriptor_async(
        data_elements, descriptor_factory
    )

    log.info("Classifying descriptors")
    classification_factory = ClassificationElementFactory.from_config(
        config['classification_factory']
    )
    classification_map = classifier.classify_async(
        list(descr_map.values()), classification_factory
    )

    log.info("Printing input file paths that classified as the given label.")
    # Map of UUID to classification element.
    uuid2c = {c.uuid: c for c in six.itervalues(classification_map)}
    for element in data_elements:
        elem_uuid = element.uuid()
        path = uuid2filepath[elem_uuid]
        classification = uuid2c[elem_uuid]
        log.debug("'{}' classification map: {}".format(
            path, classification.get_classification()
        ))
        if classification.max_label() == label:
            print(path)
예제 #24
0
파일: __init__.py 프로젝트: Kitware/SMQTK
    def __init__(self, json_config):
        """
        Initialize application based on supplied JSON configuration.

        Reads the ``descriptor_factory`` and ``descriptor_generators``
        entries of the configuration, sets up an (initially empty) cache of
        descriptor generator instances, and registers the service routes.

        :param json_config: JSON configuration dictionary
        :type json_config: dict

        """
        super(DescriptorServiceServer, self).__init__(json_config)

        # Descriptor factory setup
        self._log.info("Initializing DescriptorElementFactory")
        self.descr_elem_factory = DescriptorElementFactory.from_config(
            self.json_config['descriptor_factory']
        )

        # Descriptor generator configuration labels
        #: :type: dict[str, dict]
        self.generator_label_configs = self.json_config['descriptor_generators']

        # Cache of DescriptorGenerator instances so we don't have to
        # continuously initialize them as we get requests.
        self.descriptor_cache = {}
        self.descriptor_cache_lock = multiprocessing.RLock()

        @self.route("/")
        def list_ingest_labels():
            """
            Return the sorted configured descriptor generator labels as JSON.
            """
            return flask.jsonify({
                "labels": sorted(self.generator_label_configs)
            })

        @self.route("/all/content_types")
        def all_content_types():
            """
            Of available descriptors, what content types are processable, and
            what types are associated to which available descriptor generator.
            """
            all_types = set()
            # Mapping of configuration label to content types that generator
            # can handle
            r = {}
            for l in self.generator_label_configs:
                d = self.get_descriptor_inst(l)
                all_types.update(d.valid_content_types())
                r[l] = sorted(d.valid_content_types())

            return flask.jsonify({
                "all": sorted(all_types),
                "by-label": r
            })

        @self.route("/all/compute/<path:uri>")
        def all_compute(uri):
            """
            Compute descriptors over the specified content for all generators
            that function over the data's content type.

            JSON Return format::

                {
                    "success": <bool>

                    "content_type": <str|None>

                    "message": <str>

                    "descriptors": {  "<label>":  <list[float]>, ... } | None

                    "reference_uri": <str>
                }

            ``content_type`` is None when the URI could not be resolved to a
            data element.

            """
            message = "execution nominal"

            # A failed resolution leaves ``data_elem`` as None and records why
            # in ``message``.
            data_elem = None
            try:
                data_elem = self.resolve_data_element(uri)
            except ValueError as ex:
                message = "Failed URI resolution: %s" % str(ex)

            descriptors = {}
            finished_loop = False
            if data_elem:
                for l in self.generator_label_configs:
                    # Only use generators that accept this content type.
                    if data_elem.content_type() in \
                            self.get_descriptor_inst(l).valid_content_types():
                        d = None
                        try:
                            d = self.generate_descriptor(data_elem, l)
                        except RuntimeError as ex:
                            message = "Descriptor extraction failure: %s" \
                                      % str(ex)
                        except ValueError as ex:
                            message = "Data content type issue: %s" % str(ex)

                        # ``d`` stays None when generation failed, so the
                        # label maps to None rather than a vector list.
                        descriptors[l] = d and d.vector().tolist()
                if not descriptors:
                    # Call content_type() so the message shows the type
                    # string, not the bound method's repr.
                    message = "No descriptors can handle URI content type: %s" \
                              % data_elem.content_type()
                else:
                    finished_loop = True

            return flask.jsonify({
                "success": finished_loop,
                # ``data_elem`` is None when URI resolution failed; report no
                # content type instead of raising AttributeError.
                "content_type":
                    data_elem.content_type() if data_elem is not None
                    else None,
                "message": message,
                "descriptors": descriptors,
                "reference_uri": uri
            })

        @self.route("/<string:descriptor_label>/<path:uri>")
        def compute_descriptor(descriptor_label, uri):
            """

            Data modes for upload/use::

                - local filepath
                - base64
                - http/s URL

            The following sub-sections detail how different URI's can be used.

            Local Filepath
            --------------

            The URI string must be prefixed with ``file://``, followed by the
            full path to the data file to describe.

            Base 64 data
            ------------

            The URI string must be prefixed with "base64://", followed by the
            base64 encoded string. This mode also requires an additional
            ``?content_type=`` to provide data content type information. This
            mode saves the encoded data to temporary file for processing.

            HTTP/S address
            --------------

            This is the default mode when the URI prefix is none of the above.
            This uses the requests module to locally download a data file
            for processing.

            JSON Return format::

                {
                    "success": <bool>

                    "message": <str>

                    "descriptor": <None|list[float]>

                    "reference_uri": <str>
                }

            :type descriptor_label: str
            :type uri: str

            """
            message = "execution nominal"
            descriptor = None

            # A failed resolution leaves ``de`` as None and records why in
            # ``message``.
            de = None
            try:
                de = self.resolve_data_element(uri)
            except ValueError as ex:
                message = "URI resolution issue: %s" % str(ex)

            if de:
                try:
                    descriptor = self.generate_descriptor(de, descriptor_label)
                except RuntimeError as ex:
                    message = "Descriptor extraction failure: %s" % str(ex)
                except ValueError as ex:
                    message = "Data content type issue: %s" % str(ex)

            return flask.jsonify({
                "success": descriptor is not None,
                "message": message,
                "descriptor":
                    (descriptor is not None and descriptor.vector().tolist())
                    or None,
                "reference_uri": uri
            })
예제 #25
0
파일: __init__.py 프로젝트: dhandeo/SMQTK
    def __init__(self, json_config):
        """
        Initialize application based on supplied JSON configuration.

        Reads the ``update_descriptor_index``, ``descriptor_factory``,
        ``descriptor_index`` (only when updating is enabled), ``nn_index`` and
        ``descriptor_generator`` entries of the configuration and registers
        the service routes.

        :param json_config: JSON configuration dictionary
        :type json_config: dict

        """
        super(NearestNeighborServiceServer, self).__init__(json_config)

        self.update_index = json_config['update_descriptor_index']

        # Descriptor factory setup
        self._log.info("Initializing DescriptorElementFactory")
        self.descr_elem_factory = DescriptorElementFactory.from_config(
            self.json_config['descriptor_factory']
        )

        # The descriptor index is only constructed when updating is enabled.
        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = None
        if self.update_index:
            self._log.info("Initializing DescriptorIndex to update")
            #: :type: smqtk.representation.DescriptorIndex | None
            self.descr_index = plugin.from_plugin_config(
                json_config['descriptor_index'],
                get_descriptor_index_impls()
            )

        #: :type: smqtk.algorithms.NearestNeighborsIndex
        self.nn_index = plugin.from_plugin_config(
            json_config['nn_index'],
            get_nn_index_impls()
        )

        #: :type: smqtk.algorithms.DescriptorGenerator
        self.descriptor_generator_inst = plugin.from_plugin_config(
            self.json_config['descriptor_generator'],
            get_descriptor_generator_impls()
        )

        @self.route("/count", methods=['GET'])
        def count():
            """
            Return the number of elements represented in this index.
            """
            return flask.jsonify(**{
                "count": self.nn_index.count(),
            })

        @self.route("/compute/<path:uri>", methods=["POST"])
        def compute(uri):
            """
            Compute the descriptor for a URI specified data element using the
            configured descriptor generator.

            If a descriptor index was configured and update was turned on,
            we add the computed descriptor to the index.

            JSON Return format::
                {
                    "success": <bool>

                    "message": <str>

                    "descriptor": <None|list[float]>

                    "reference_uri": <str>
                }

            :param uri: URI data specification.

            """
            descriptor = None
            try:
                _, descriptor = self.generate_descriptor_for_uri(uri)
                message = "Descriptor generated"
                # Materialize as a list so the value is a plain list of
                # floats on Python 3 as well (where ``map`` is lazy).
                descriptor = list(map(float, descriptor.vector()))
            except ValueError as ex:
                message = "Input value issue: %s" % str(ex)
            except RuntimeError as ex:
                message = "Descriptor extraction failure: %s" % str(ex)
            # NOTE(review): the visible excerpt ends here without building a
            # response; confirm the remainder against the full source.
예제 #26
0
def main():
    """
    Compute a descriptor vector for a single input file (optparse CLI).

    The first positional argument is the input file path; additional
    positional arguments are ignored with a warning. The resulting vector is
    saved with ``numpy.save`` when ``--output-filepath`` was given, otherwise
    it is printed to standard out as space-separated ``%15f`` formatted
    values.

    The process exits with status 1 when no valid configuration file or no
    input path was provided, or when no descriptor vector could be produced.
    """
    usage = "%prog [OPTIONS] INPUT_FILE"
    description = """\
Compute a descriptor vector for a given data file, outputting the generated
feature vector to standard out, or to an output file if one was specified (in
numpy format).
"""
    parser = bin_utils.SMQTKOptParser(usage, description=description)

    group_labels = optparse.OptionGroup(parser, "Configuration")
    group_labels.add_option('-c', '--config',
                            default=None,
                            help='Path to the JSON configuration file.')
    group_labels.add_option('--output-config',
                            default=None,
                            help='Optional path to output default JSON '
                                 'configuration to.')
    parser.add_option_group(group_labels)

    group_optional = optparse.OptionGroup(parser, "Optional Parameters")
    group_optional.add_option('--overwrite',
                              action='store_true', default=False,
                              help="Force descriptor computation even if an "
                                   "existing descriptor vector was discovered "
                                   "based on the given content descriptor type "
                                   "and data combination.")
    group_optional.add_option('-o', '--output-filepath',
                              help='Optional path to a file to output feature '
                                   'vector to. Otherwise the feature vector is '
                                   'printed to standard out. Output is saved '
                                   'in numpy binary format (.npy suffix '
                                   'recommended).')
    group_optional.add_option('-v', '--verbose',
                              action='store_true', default=False,
                              help='Print additional debugging messages. All '
                                   'logging goes to standard error.')
    parser.add_option_group(group_optional)

    opts, args = parser.parse_args()

    output_filepath = opts.output_filepath
    overwrite = opts.overwrite
    verbose = opts.verbose

    llevel = logging.DEBUG if verbose else logging.INFO
    bin_utils.initialize_logging(logging.getLogger(), llevel)
    log = logging.getLogger("main")

    bin_utils.output_config(opts.output_config, default_config(), log)

    if not opts.config:
        log.error("No configuration provided")
        exit(1)
    elif not os.path.isfile(opts.config):
        log.error("Configuration file path not valid.")
        exit(1)

    if len(args) == 0:
        log.error("Failed to provide an input file path")
        exit(1)
    if len(args) > 1:
        log.warning("More than one filepath provided as an argument. Only "
                    "computing for the first one.")

    with open(opts.config, 'r') as f:
        config = json.load(f)

    input_filepath = args[0]
    data_element = DataFileElement(input_filepath)

    factory = DescriptorElementFactory.from_config(config['descriptor_factory'])
    # NOTE(review): the impl getter is passed uncalled here; confirm
    # ``plugin.from_plugin_config`` accepts a callable in this version.
    #: :type: smqtk.descriptor_generator.DescriptorGenerator
    cd = plugin.from_plugin_config(config['content_descriptor'],
                                   get_descriptor_generator_impls)
    descr_elem = cd.compute_descriptor(data_element, factory, overwrite)
    vec = descr_elem.vector()

    if vec is None:
        log.error("Failed to generate a descriptor vector for the input data!")
        # Nothing valid to save or print; stop instead of writing/iterating
        # a None value below.
        exit(1)

    if output_filepath:
        numpy.save(output_filepath, vec)
    else:
        # Construct string, because numpy
        # noinspection PyTypeChecker
        print(' '.join('%15f' % f for f in vec))
예제 #27
0
def run_file_list(c,
                  filelist_filepath,
                  checkpoint_filepath,
                  batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath to
        SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before queueing
        that file for processing. If the check fails, the file is skipped
        instead of a halting exception being raised.
    :type check_image: bool

    """
    log = logging.getLogger(__name__)

    # Read the whole list up front and close the handle right away.
    with open(filelist_filepath) as filelist:
        file_paths = [line.strip() for line in filelist]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    data_set = None
    if c['optional_data_set']['type'] is None:
        log.info("Not saving loaded data elements to data set")
    else:
        log.info("Initializing data set to append to")
        #: :type: smqtk.representation.DataSet
        data_set = plugin.from_plugin_config(c['optional_data_set'],
                                             get_data_set_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def iter_valid_elements():
        """
        Yield a DataFileElement for every listed file that passes
        validation, adding them to the configured data set (if any).
        """
        def is_valid(file_path):
            # Returns the element itself when valid so the caller can reuse
            # it, otherwise False.
            dfe = DataFileElement(file_path)

            if is_valid_element(
                    dfe,
                    valid_content_types=generator.valid_content_types(),
                    check_image=check_image):
                return dfe
            else:
                return False

        data_elements = collections.deque()
        valid_files_filter = parallel.parallel_map(is_valid,
                                                   file_paths,
                                                   name="check-file-type",
                                                   use_multiprocessing=True)
        for dfe in valid_files_filter:
            if dfe:
                yield dfe
                if data_set is not None:
                    data_elements.append(dfe)
                    if batch_size and len(data_elements) == batch_size:
                        log.debug(
                            "Adding data element batch to set (size: %d)",
                            len(data_elements))
                        data_set.add_data(*data_elements)
                        data_elements.clear()
        # elements only collected if we have a data-set configured, so add any
        # still in the deque to the set
        if data_elements:
            log.debug("Adding data elements to set (size: %d)",
                      len(data_elements))
            data_set.add_data(*data_elements)

    log.info("Computing descriptors")
    m = compute_many_descriptors(
        iter_valid_elements(),
        generator,
        factory,
        descriptor_index,
        batch_size=batch_size,
    )

    # Recording computed file paths and associated file UUIDs (SHA1)
    with open(checkpoint_filepath, 'w') as cf:
        cf_writer = csv.writer(cf)
        rps = [0] * 7
        for fp, descr in m:
            cf_writer.writerow([fp, descr.uuid()])
            report_progress(log.debug, rps, 1.)

    log.info("Done")
예제 #28
0
파일: __init__.py 프로젝트: msarahan/SMQTK
    def __init__(self, json_config):
        """
        Initialize application based on supplied JSON configuration.

        Reads the ``descriptor_factory``, ``descriptor_generator`` and
        ``nn_index`` entries of the configuration and registers the
        nearest-neighbor route on this application.

        :param json_config: JSON configuration dictionary
        :type json_config: dict

        """
        super(NearestNeighborServiceServer, self).__init__(json_config)

        # Descriptor factory setup
        self.log.info("Initializing DescriptorElementFactory")
        self.descr_elem_factory = DescriptorElementFactory.from_config(
            self.json_config['descriptor_factory']
        )

        # Descriptor generator configuration labels
        #: :type: dict[str, dict]
        self.generator_config = self.json_config['descriptor_generator']

        # NOTE(review): the ``get_*_impls`` getters are passed uncalled here;
        # confirm ``plugin.from_plugin_config`` accepts a callable in this
        # version of the library.
        #: :type: smqtk.algorithms.NearestNeighborsIndex
        self.nn_index = plugin.from_plugin_config(
            json_config['nn_index'],
            get_nn_index_impls
        )

        #: :type: smqtk.algorithms.DescriptorGenerator
        self.descriptor_generator_inst = plugin.from_plugin_config(
            self.generator_config,
            get_descriptor_generator_impls
        )

        @self.route("/nn/<path:uri>")
        @self.route("/nn/n=<int:n>/<path:uri>")
        @self.route("/nn/n=<int:n>/<int:start_i>:<int:end_i>/<path:uri>")
        def compute_nearest_neighbors(uri, n=10, start_i=None, end_i=None):
            """
            Data modes for upload/use::

                - local filepath
                - base64
                - http/s URL

            The following sub-sections detail how different URI's can be used.

            Local Filepath
            --------------

            The URI string must be prefixed with ``file://``, followed by the
            full path to the data file to describe.

            Base 64 data
            ------------

            The URI string must be prefixed with "base64://", followed by the
            base64 encoded string. This mode also requires an additional
            ``?content_type=`` to provide data content type information. This
            mode saves the encoded data to temporary file for processing.

            HTTP/S address
            --------------

            This is the default mode when the URI prefix is none of the above.
            This uses the requests module to locally download a data file
            for processing.

            JSON Return format::
                {
                    "success": <bool>

                    "message": <str>

                    "neighbors": <None|list[float]>

                    "reference_uri": <str>
                }

            :type uri: str

            """
            message = "execution nominal"
            descriptor = None

            # A failed resolution leaves ``de`` as None and records why in
            # ``message``.
            de = None
            try:
                self.log.debug("Received URI: %s", uri)
                de = self.resolve_data_element(uri)
            except ValueError as ex:
                message = "URI resolution issue: %s" % str(ex)

            if de:
                try:
                    descriptor = self.descriptor_generator_inst.\
                        compute_descriptor(de, self.descr_elem_factory)
                except RuntimeError as ex:
                    message = "Descriptor extraction failure: %s" % str(ex)
                except ValueError as ex:
                    message = "Data content type issue: %s" % str(ex)
            # NOTE(review): the visible excerpt ends here without building a
            # response; confirm the remainder against the full source.
예제 #29
0
import json

from smqtk.representation import DescriptorElementFactory
from smqtk.utils.bin_utils import logging, initialize_logging
from smqtk.utils.jsmin import jsmin

from load_algo import load_algo

# Only configure logging when nothing has installed a root handler yet.
if not logging.getLogger().handlers:
    initialize_logging(logging.getLogger(), logging.DEBUG)
log = logging.getLogger(__name__)

log.info("Loading descriptor elements")
# Both input files are read fully and closed immediately.
with open("descriptor_type_name.txt") as type_name_file:
    d_type_str = type_name_file.read().strip()
# The factory config is passed through jsmin before JSON parsing.
with open('descriptor_factory_config.json') as factory_config_file:
    df_config = json.loads(jsmin(factory_config_file.read()))
factory = DescriptorElementFactory.from_config(df_config)

#
# Sample code for finding non-NaN descriptors in parallel
#
# def add_non_nan_uuid(uuid):
#     d = factory.new_descriptor(d_type_str, uuid)
#     if d.vector().sum() > 0:
#         return uuid
#     return None
#
# import multiprocessing
# p = multiprocessing.Pool()
# non_nan_uuids = \
#     p.map(add_non_nan_uuid,
#           (l.strip() for l in open('descriptor_uuids.txt')))
예제 #30
0
# ``json`` is used below for parsing the factory configuration.
import json

from smqtk.representation import DescriptorElementFactory
from smqtk.utils.bin_utils import logging, initialize_logging
from smqtk.utils.jsmin import jsmin

from load_algo import load_algo


# Only configure logging when nothing has installed a root handler yet.
if not logging.getLogger().handlers:
    initialize_logging(logging.getLogger(), logging.DEBUG)
log = logging.getLogger(__name__)


log.info("Loading descriptor elements")
# Both input files are read fully and closed immediately.
with open("descriptor_type_name.txt") as type_name_file:
    d_type_str = type_name_file.read().strip()
# The factory config is passed through jsmin before JSON parsing.
with open('descriptor_factory_config.json') as factory_config_file:
    df_config = json.loads(jsmin(factory_config_file.read()))
factory = DescriptorElementFactory.from_config(df_config)

#
# Sample code for finding non-NaN descriptors in parallel
#
# def add_non_nan_uuid(uuid):
#     d = factory.new_descriptor(d_type_str, uuid)
#     if d.vector().sum() > 0:
#         return uuid
#     return None
#
# import multiprocessing
# p = multiprocessing.Pool()
# non_nan_uuids = \
#     p.map(add_non_nan_uuid,
#           (l.strip() for l in open('descriptor_uuids.txt')))
예제 #31
0
    def __init__(self, json_config):
        """
        Initialize application based on supplied JSON configuration.

        Reads the ``update_descriptor_index``, ``descriptor_factory``,
        ``descriptor_index`` (only when updating is enabled), ``nn_index`` and
        ``descriptor_generator`` entries of the configuration and registers
        the service routes.

        :param json_config: JSON configuration dictionary
        :type json_config: dict

        """
        super(NearestNeighborServiceServer, self).__init__(json_config)

        self.update_index = json_config['update_descriptor_index']

        # Descriptor factory setup
        self._log.info("Initializing DescriptorElementFactory")
        self.descr_elem_factory = DescriptorElementFactory.from_config(
            self.json_config['descriptor_factory'])

        # The descriptor index is only constructed when updating is enabled.
        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = None
        if self.update_index:
            self._log.info("Initializing DescriptorIndex to update")
            #: :type: smqtk.representation.DescriptorIndex | None
            self.descr_index = plugin.from_plugin_config(
                json_config['descriptor_index'], get_descriptor_index_impls())

        #: :type: smqtk.algorithms.NearestNeighborsIndex
        self.nn_index = plugin.from_plugin_config(json_config['nn_index'],
                                                  get_nn_index_impls())

        #: :type: smqtk.algorithms.DescriptorGenerator
        self.descriptor_generator_inst = plugin.from_plugin_config(
            self.json_config['descriptor_generator'],
            get_descriptor_generator_impls())

        @self.route("/count", methods=['GET'])
        def count():
            """
            Return the number of elements represented in this index.
            """
            return flask.jsonify(**{
                "count": self.nn_index.count(),
            })

        @self.route("/compute/<path:uri>", methods=["POST"])
        def compute(uri):
            """
            Compute the descriptor for a URI specified data element using the
            configured descriptor generator.

            If a descriptor index was configured and update was turned on,
            we add the computed descriptor to the index.

            JSON Return format::
                {
                    "success": <bool>

                    "message": <str>

                    "descriptor": <None|list[float]>

                    "reference_uri": <str>
                }

            :param uri: URI data specification.

            """
            descriptor = None
            try:
                _, descriptor = self.generate_descriptor_for_uri(uri)
                message = "Descriptor generated"
                # Materialize as a list so the value is a plain list of
                # floats on Python 3 as well (where ``map`` is lazy).
                descriptor = list(map(float, descriptor.vector()))
            except ValueError as ex:
                message = "Input value issue: %s" % str(ex)
            except RuntimeError as ex:
                message = "Descriptor extraction failure: %s" % str(ex)
            # NOTE(review): the visible excerpt ends here without building a
            # response; confirm the remainder against the full source.
예제 #32
0
def classify_files(config, label, file_globs):
    """
    Compute descriptors and classifications for the files matched by
    ``file_globs`` and print the path of every file whose classification
    ``max_label()`` equals ``label``.

    If ``label`` is None, the labels reported by the configured classifier
    are logged and nothing is classified. If ``label`` is not one of the
    classifier's labels, an error plus the available labels are logged and
    the function returns without classifying anything.

    :param config: Configuration dictionary. The keys read here are
        ``classifier``, ``descriptor_factory``, ``descriptor_generator`` and
        ``classification_factory``.
    :type config: dict

    :param label: Classification label to filter on, or None to only list
        the labels available from the classifier.
    :type label: str | None

    :param file_globs: Iterable of file paths and/or glob patterns. Entries
        that are existing files are used directly, anything else is expanded
        with ``glob.iglob``.
    :type file_globs: collections.Iterable[str]

    :raises RuntimeError: No files resulted from the given paths/globs.

    """
    log = logging.getLogger(__name__)

    #: :type: smqtk.algorithms.Classifier
    classifier = \
        plugin.from_plugin_config(config['classifier'],
                                  get_classifier_impls())

    def log_available_labels():
        log.info("Available classifier labels:")
        for l in classifier.get_labels():
            log.info("- %s", l)

    if label is None:
        log_available_labels()
        return
    elif label not in classifier.get_labels():
        log.error(
            "Invalid classification label provided to compute and filter "
            "on: '%s'", label)
        log_available_labels()
        return

    log.info("Collecting files from globs")
    #: :type: list[DataFileElement]
    data_elements = []
    # map of data element UUID to the file path it was created from
    uuid2filepath = {}

    def register_file(filepath):
        # Wrap the file in a data element and remember where it came from.
        elem = DataFileElement(filepath)
        data_elements.append(elem)
        uuid2filepath[elem.uuid()] = filepath

    for g in file_globs:
        if os.path.isfile(g):
            register_file(g)
        else:
            log.debug("expanding glob: %s", g)
            for fp in glob.iglob(g):
                register_file(fp)
    if not data_elements:
        raise RuntimeError("No files provided for classification.")

    log.info("Computing descriptors")
    descriptor_factory = \
        DescriptorElementFactory.from_config(config['descriptor_factory'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(config['descriptor_generator'],
                                  get_descriptor_generator_impls())
    descr_map = descriptor_generator\
        .compute_descriptor_async(data_elements, descriptor_factory)

    log.info("Classifying descriptors")
    classification_factory = ClassificationElementFactory \
        .from_config(config['classification_factory'])
    classification_map = classifier\
        .classify_async(descr_map.values(), classification_factory)

    log.info("Printing input file paths that classified as the given label.")
    # map of UUID to classification element.
    # ``values()`` (rather than the Python-2-only ``itervalues()``) keeps
    # this working on both Python 2 and 3.
    uuid2c = dict((c.uuid, c) for c in classification_map.values())
    for data in data_elements:
        data_uuid = data.uuid()
        if uuid2c[data_uuid].max_label() == label:
            # Single-argument call form prints identically on Python 2 and 3.
            print(uuid2filepath[data_uuid])
예제 #33
0
    def __init__(self, json_config):
        """
        Initialize application based on supplied JSON configuration.

        Besides constructing the configured components, this registers the
        ``/count``, ``/compute/<uri>`` and ``/nn/...`` routes on this
        application instance.

        :param json_config: JSON configuration dictionary. The keys read here
            are ``update_descriptor_index``, ``descriptor_factory``,
            ``descriptor_index`` (only when ``update_descriptor_index`` is
            true), ``nn_index`` and ``descriptor_generator``.
        :type json_config: dict

        """
        super(NearestNeighborServiceServer, self).__init__(json_config)

        self.update_index = json_config['update_descriptor_index']

        # Descriptor factory setup
        self._log.info("Initializing DescriptorElementFactory")
        self.descr_elem_factory = DescriptorElementFactory.from_config(
            self.json_config['descriptor_factory'])

        # The descriptor index is only constructed when updating is enabled.
        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = None
        if self.update_index:
            self._log.info("Initializing DescriptorIndex to update")
            #: :type: smqtk.representation.DescriptorIndex | None
            self.descr_index = plugin.from_plugin_config(
                json_config['descriptor_index'], get_descriptor_index_impls())

        #: :type: smqtk.algorithms.NearestNeighborsIndex
        self.nn_index = plugin.from_plugin_config(json_config['nn_index'],
                                                  get_nn_index_impls())

        #: :type: smqtk.algorithms.DescriptorGenerator
        self.descriptor_generator_inst = plugin.from_plugin_config(
            self.json_config['descriptor_generator'],
            get_descriptor_generator_impls())

        @self.route("/count", methods=['GET'])
        def count():
            """
            Return the number of elements represented in this index.
            """
            return flask.jsonify(count=self.nn_index.count())

        @self.route("/compute/<path:uri>", methods=["POST"])
        def compute(uri):
            """
            Compute the descriptor for a URI specified data element using the
            configured descriptor generator.

            See ``compute_nearest_neighbors`` method docstring for URI
            specifications accepted.

            If a descriptor index was configured and update was turned on,
            the computed descriptor is expected to be added to the index
            (presumably inside ``generate_descriptor_for_uri``, which is not
            defined in this method -- confirm there).

            JSON Return format::
                {
                    "success": <bool>

                    "message": <str>

                    "descriptor": <None|list[float]>

                    "reference_uri": <str>
                }

            :param uri: URI data specification.

            """
            descriptor = None
            try:
                # Keep the element in its own name so that ``descriptor`` is
                # only ever None or a JSON-serializable list of floats, even
                # if extracting/converting the vector raises.
                descr_elem = self.generate_descriptor_for_uri(uri)
                descriptor = list(map(float, descr_elem.vector()))
                message = "Descriptor generated"
            except ValueError as ex:
                message = "Input value issue: %s" % str(ex)
            except RuntimeError as ex:
                message = "Descriptor extraction failure: %s" % str(ex)

            return flask.jsonify(
                success=descriptor is not None,
                message=message,
                descriptor=descriptor,
                reference_uri=uri,
            )

        @self.route("/nn/<path:uri>")
        @self.route("/nn/n=<int:n>/<path:uri>")
        @self.route("/nn/n=<int:n>/<int:start_i>:<int:end_i>/<path:uri>")
        def compute_nearest_neighbors(uri, n=10, start_i=None, end_i=None):
            """
            Data modes for upload/use:

                - local filepath
                - base64
                - http/s URL
                - existing data/descriptor UUID

            The following sub-sections detail how different URI's can be used.

            Local Filepath
            --------------
            The URI string must be prefixed with ``file://``, followed by the
            full path to the data file to describe.

            Base 64 data
            ------------
            The URI string must be prefixed with "base64://", followed by the
            base64 encoded string. This mode also requires an additional
            ``?content_type=`` to provide data content type information. This
            mode saves the encoded data to temporary file for processing.

            HTTP/S address
            --------------
            This is the default mode when the URI prefix is none of the above.
            This uses the requests module to locally download a data file
            for processing.

            Existing Data/Descriptor by UUID
            --------------------------------
            When given a uri prefixed with "uuid://", we interpret the remainder
            of the uri as the UUID of a descriptor already present in the
            configured descriptor index. If the given UUID is not present in the
            index, a KeyError is raised (this handler only catches ValueError
            and RuntimeError).

            JSON Return format
            ------------------
                {
                    "success": <bool>

                    "message": <str>

                    "neighbors": <list of neighbor UUIDs>

                    "distances": <slice of the distances returned by the
                                  nearest-neighbor index>

                    "reference_uri": <str>
                }

            ``success`` is false when either descriptor generation or the
            nearest-neighbor query failed; ``neighbors`` and ``distances``
            are empty in that case.

            :param uri: URI data specification.
            :param n: Number of neighbors to query for
            :param start_i: The starting index of the neighbor vectors to slice
                into for return.
            :param end_i: The ending index of the neighbor vectors to slice
                into for return.
            :type uri: str

            """
            descriptor = None
            try:
                descriptor = self.generate_descriptor_for_uri(uri)
                message = "descriptor computed"
            except ValueError as ex:
                message = "Input data issue: %s" % str(ex)
            except RuntimeError as ex:
                message = "Descriptor generation failure: %s" % str(ex)
            success = descriptor is not None

            # Base pagination slicing based on provided start and end indices,
            # otherwise clamp to beginning/ending of queried neighbor sequence.
            page_slice = slice(start_i or 0, end_i or n)
            neighbors = []
            dists = []
            if success:
                try:
                    neighbors, dists = \
                        self.nn_index.nn(descriptor, n)
                except ValueError as ex:
                    # A failed query must not be reported as a success.
                    success = False
                    message = "Descriptor or index related issue: %s" % str(ex)

            # TODO: Return the optional descriptor vectors for the neighbors
            # noinspection PyTypeChecker
            d = {
                "success": success,
                "message": message,
                "neighbors": [elem.uuid() for elem in neighbors[page_slice]],
                "distances": dists[page_slice],
                "reference_uri": uri
            }
            return flask.jsonify(d)
예제 #34
0
파일: __init__.py 프로젝트: Kitware/SMQTK
    def __init__(self, json_config):
        """
        Initialize application based on supplied JSON configuration.

        Besides constructing the configured components, this registers the
        ``/count``, ``/compute/<uri>`` and ``/nn/...`` routes on this
        application instance.

        :param json_config: JSON configuration dictionary. The keys read here
            are ``update_descriptor_index``, ``descriptor_factory``,
            ``descriptor_index`` (only when ``update_descriptor_index`` is
            true), ``nn_index`` and ``descriptor_generator``.
        :type json_config: dict

        """
        super(NearestNeighborServiceServer, self).__init__(json_config)

        self.update_index = json_config['update_descriptor_index']

        # Descriptor factory setup
        self._log.info("Initializing DescriptorElementFactory")
        self.descr_elem_factory = DescriptorElementFactory.from_config(
            self.json_config['descriptor_factory']
        )

        # The descriptor index is only constructed when updating is enabled.
        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = None
        if self.update_index:
            self._log.info("Initializing DescriptorIndex to update")
            #: :type: smqtk.representation.DescriptorIndex | None
            self.descr_index = plugin.from_plugin_config(
                json_config['descriptor_index'],
                get_descriptor_index_impls()
            )

        #: :type: smqtk.algorithms.NearestNeighborsIndex
        self.nn_index = plugin.from_plugin_config(
            json_config['nn_index'],
            get_nn_index_impls()
        )

        #: :type: smqtk.algorithms.DescriptorGenerator
        self.descriptor_generator_inst = plugin.from_plugin_config(
            self.json_config['descriptor_generator'],
            get_descriptor_generator_impls()
        )

        @self.route("/count", methods=['GET'])
        def count():
            """
            Return the number of elements represented in this index.
            """
            return flask.jsonify(count=self.nn_index.count())

        @self.route("/compute/<path:uri>", methods=["POST"])
        def compute(uri):
            """
            Compute the descriptor for a URI specified data element using the
            configured descriptor generator.

            See ``compute_nearest_neighbors`` method docstring for URI
            specifications accepted.

            If a descriptor index was configured and update was turned on,
            the computed descriptor is expected to be added to the index
            (presumably inside ``generate_descriptor_for_uri``, which is not
            defined in this method -- confirm there).

            JSON Return format::
                {
                    "success": <bool>

                    "message": <str>

                    "descriptor": <None|list[float]>

                    "reference_uri": <str>
                }

            :param uri: URI data specification.

            """
            descriptor = None
            try:
                # Keep the element in its own name so that ``descriptor`` is
                # only ever None or a JSON-serializable list of floats, even
                # if extracting/converting the vector raises.
                descr_elem = self.generate_descriptor_for_uri(uri)
                descriptor = list(map(float, descr_elem.vector()))
                message = "Descriptor generated"
            except ValueError as ex:
                message = "Input value issue: %s" % str(ex)
            except RuntimeError as ex:
                message = "Descriptor extraction failure: %s" % str(ex)

            return flask.jsonify(
                success=descriptor is not None,
                message=message,
                descriptor=descriptor,
                reference_uri=uri,
            )

        @self.route("/nn/<path:uri>")
        @self.route("/nn/n=<int:n>/<path:uri>")
        @self.route("/nn/n=<int:n>/<int:start_i>:<int:end_i>/<path:uri>")
        def compute_nearest_neighbors(uri, n=10, start_i=None, end_i=None):
            """
            Data modes for upload/use:

                - local filepath
                - base64
                - http/s URL
                - existing data/descriptor UUID

            The following sub-sections detail how different URI's can be used.

            Local Filepath
            --------------
            The URI string must be prefixed with ``file://``, followed by the
            full path to the data file to describe.

            Base 64 data
            ------------
            The URI string must be prefixed with "base64://", followed by the
            base64 encoded string. This mode also requires an additional
            ``?content_type=`` to provide data content type information. This
            mode saves the encoded data to temporary file for processing.

            HTTP/S address
            --------------
            This is the default mode when the URI prefix is none of the above.
            This uses the requests module to locally download a data file
            for processing.

            Existing Data/Descriptor by UUID
            --------------------------------
            When given a uri prefixed with "uuid://", we interpret the remainder
            of the uri as the UUID of a descriptor already present in the
            configured descriptor index. If the given UUID is not present in the
            index, a KeyError is raised (this handler only catches ValueError
            and RuntimeError).

            JSON Return format
            ------------------
                {
                    "success": <bool>

                    "message": <str>

                    "neighbors": <list of neighbor UUIDs>

                    "distances": <slice of the distances returned by the
                                  nearest-neighbor index>

                    "reference_uri": <str>
                }

            ``success`` is false when either descriptor generation or the
            nearest-neighbor query failed; ``neighbors`` and ``distances``
            are empty in that case.

            :param uri: URI data specification.
            :param n: Number of neighbors to query for
            :param start_i: The starting index of the neighbor vectors to slice
                into for return.
            :param end_i: The ending index of the neighbor vectors to slice
                into for return.
            :type uri: str

            """
            descriptor = None
            try:
                descriptor = self.generate_descriptor_for_uri(uri)
                message = "descriptor computed"
            except ValueError as ex:
                message = "Input data issue: %s" % str(ex)
            except RuntimeError as ex:
                message = "Descriptor generation failure: %s" % str(ex)
            success = descriptor is not None

            # Base pagination slicing based on provided start and end indices,
            # otherwise clamp to beginning/ending of queried neighbor sequence.
            page_slice = slice(start_i or 0, end_i or n)
            neighbors = []
            dists = []
            if success:
                try:
                    neighbors, dists = \
                        self.nn_index.nn(descriptor, n)
                except ValueError as ex:
                    # A failed query must not be reported as a success.
                    success = False
                    message = "Descriptor or index related issue: %s" % str(ex)

            # TODO: Return the optional descriptor vectors for the neighbors
            # noinspection PyTypeChecker
            d = {
                "success": success,
                "message": message,
                "neighbors": [elem.uuid() for elem in neighbors[page_slice]],
                "distances": dists[page_slice],
                "reference_uri": uri
            }
            return flask.jsonify(d)
예제 #35
0
def main():
    """
    Command line entry point: compute the descriptor vector of a single
    input file and either save it with ``numpy.save`` (when an output file
    path was given) or print it as space-separated ``%15f`` values.

    The process exits with status 1 when the configuration file path is
    invalid, no configuration was loaded, the input file is missing or not
    a file, or no descriptor vector could be generated.
    """
    parser = cli_parser()
    args = parser.parse_args()

    output_filepath = args.output_filepath
    overwrite = args.overwrite
    verbose = args.verbose

    llevel = logging.DEBUG if verbose else logging.INFO
    bin_utils.initialize_logging(logging.getLogger(), llevel)
    log = logging.getLogger("main")

    # Merge loaded config with default
    config_loaded = False
    config = default_config()
    if args.config:
        if not os.path.isfile(args.config):
            log.error("Configuration file path not valid.")
            exit(1)
        with open(args.config, 'r') as f:
            config.update(json.load(f))
        config_loaded = True

    bin_utils.output_config(args.output_config, config, log, True)

    # Configuration must have been loaded at this point since we can't normally
    # trust the default.
    if not config_loaded:
        log.error("No configuration provided")
        exit(1)

    if not args.input_file:
        log.error("Failed to provide an input file path")
        exit(1)
    elif not os.path.isfile(args.input_file):
        log.error("Given path does not point to a file.")
        exit(1)

    input_filepath = args.input_file
    data_element = DataFileElement(input_filepath)

    factory = DescriptorElementFactory.from_config(
        config['descriptor_factory'])
    # NOTE(review): the getter is passed uncalled here; presumably this
    # version of ``plugin.from_plugin_config`` accepts a callable -- confirm
    # against the ``plugin`` module in use.
    #: :type: smqtk.algorithms.descriptor_generator.DescriptorGenerator
    cd = plugin.from_plugin_config(config['content_descriptor'],
                                   get_descriptor_generator_impls)
    descr_elem = cd.compute_descriptor(data_element, factory, overwrite)
    vec = descr_elem.vector()

    if vec is None:
        log.error("Failed to generate a descriptor vector for the input data!")
        # Nothing to save or print; stop here instead of failing later on a
        # None vector.
        exit(1)

    if output_filepath:
        numpy.save(output_filepath, vec)
    else:
        # Construct string, because numpy
        # noinspection PyTypeChecker
        print(' '.join('%15f' % f for f in vec))