Example #1
    def build_index(self, descriptors):
        """
        Build the index over the descriptor data elements. This in turn builds
        the configured hash index if one is set.

        Subsequent calls to this method should rebuild the index, not add to
        it, or raise an exception so as to protect the current index. Rebuilding
        the LSH index involves clearing the set descriptor index, key-value
        store and, if set, the hash index.

        :raises ValueError: No data available in the given iterable.

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        if self.read_only:
            raise ReadOnlyError("Cannot modify container attributes due to "
                                "being in read-only mode.")

        self._log.debug("Clearing and adding new descriptor elements")
        self.descriptor_index.clear()
        self.descriptor_index.add_many_descriptors(descriptors)

        self._log.debug("Generating hash codes")
        state = [0] * 7
        hash_vectors = collections.deque()
        self.hash2uuids_kvstore.clear()
        for d in self.descriptor_index:
            h = self.lsh_functor.get_hash(d.vector())
            hash_vectors.append(h)

            h_int = bit_vector_to_int_large(h)

            # Get, update and reinsert hash UUID set object
            #: :type: set
            hash_uuid_set = self.hash2uuids_kvstore.get(h_int, set())
            hash_uuid_set.add(d.uuid())
            self.hash2uuids_kvstore.add(h_int, hash_uuid_set)

            report_progress(self._log.debug, state, 1.0)
        state[1] -= 1
        report_progress(self._log.debug, state, 0)

        if self.hash_index is not None:
            self._log.debug("Clearing and building hash index of type %s",
                            type(self.hash_index))
            # a build is supposed to clear previous state.
            self.hash_index.build_index(hash_vectors)
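
The loop above accumulates a multimap from integer hash codes to descriptor UUIDs via a get/update/re-insert cycle on the key-value store. Below is a minimal stand-alone sketch of the same bookkeeping with a plain dict; the bit_vector_to_int helper and its big-endian packing are assumptions standing in for bit_vector_to_int_large.

import numpy as np

def bit_vector_to_int(v):
    # Hypothetical stand-in for bit_vector_to_int_large; packing order assumed.
    out = 0
    for b in v:
        out = (out << 1) | int(b)
    return out

# Accumulate hash-code -> {descriptor UUID}, mirroring the kvstore loop above.
hash2uuids = {}
for uuid, h in [("d0", np.array([1, 0, 1], bool)),
                ("d1", np.array([1, 0, 1], bool)),
                ("d2", np.array([0, 1, 1], bool))]:
    h_int = bit_vector_to_int(h)
    hash2uuids.setdefault(h_int, set()).add(uuid)

print(hash2uuids)  # {5: {'d0', 'd1'}, 3: {'d2'}}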
Example #2
def main():
    description = """
    Utility for fetching remotely stored image paths from the JPL Solr index.

    Files will be transferred with their entire containing directories. For
    example, if the file was stored in "/data/things/image.png" remotely, it
    will be transferred locally to "<output_dir>/data/things/image.png".

    Assumptions:
        - JPL MEMEX Solr index key structure
            - `id` == "file:<abs-filepath>"
            - `mainType` is the first component of the MIMETYPE
            - `indexedAt` timestamp
    """
    args, config = bin_utils.utility_main_helper(default_config, description,
                                                 extend_parser)
    log = logging.getLogger(__name__)

    paths_file = args.paths_file
    after_time = args.after_time
    before_time = args.before_time

    #
    # Check dir/file locations
    #
    if paths_file is None:
        raise ValueError("Need a file path to to output transferred file "
                         "paths!")

    file_utils.safe_create_dir(os.path.dirname(paths_file))

    #
    # Start collection
    #
    remote_paths = solr_image_paths(
        config['solr_address'],
        after_time or '*', before_time or '*',
        config['solr_username'], config['solr_password'],
        config['batch_size']
    )

    log.info("Writing file paths")
    s = [0] * 7
    with open(paths_file, 'w') as of:
        for rp in remote_paths:
            of.write(rp + '\n')
            bin_utils.report_progress(log.info, s, 1.)
    # Final report
    s[1] -= 1
    bin_utils.report_progress(log.info, s, 0)
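
The s = [0] * 7 / report_progress(...) / s[1] -= 1 pattern recurs throughout these examples: per-item calls throttled by a time interval, then a forced final report. Below is a rough, self-contained sketch of a reporter with the same call shape; the internal layout of the state list is an assumption for illustration, not SMQTK's actual implementation.

import time

def report_progress(log_func, state, interval):
    # Assumed layout: state[0] = items seen, state[1] = count at last report,
    # state[2] = time of last report. The remaining slots are unused here.
    state[0] += 1
    now = time.time()
    if not state[2]:
        state[2] = now
        return
    if now - state[2] >= interval and state[0] != state[1]:
        dt = max(now - state[2], 1e-9)
        log_func("processed %d items (%.1f items/s)"
                 % (state[0], (state[0] - state[1]) / dt))
        state[1] = state[0]
        state[2] = now

s = [0] * 7
for _ in range(100000):
    report_progress(print, s, 1.0)   # throttled to roughly one report per second
s[1] -= 1
report_progress(print, s, 0)         # decrement + zero interval forces a final report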
Example #3
def main():
    description = """
    Utility for fetching remotely stored image paths from the JPL Solr index.

    Files will be transferred with their entire containing directories. For
    example, if the file was stored in "/data/things/image.png" remotely, it
    will be transferred locally to "<output_dir>/data/things/image.png".

    Assumptions:
        - JPL MEMEX Solr index key structure
            - `id` == "file:<abs-filepath>"
            - `mainType` is the first component of the MIMETYPE
            - `indexedAt` timestamp
    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    paths_file = args.paths_file
    after_time = args.after_time
    before_time = args.before_time

    #
    # Check dir/file locations
    #
    if paths_file is None:
        raise ValueError("Need a file path to to output transferred file "
                         "paths!")

    file_utils.safe_create_dir(os.path.dirname(paths_file))

    #
    # Start collection
    #
    remote_paths = solr_image_paths(
        config['solr_address'],
        after_time or '*', before_time or '*',
        config['solr_username'], config['solr_password'],
        config['batch_size']
    )

    log.info("Writing file paths")
    s = [0] * 7
    with open(paths_file, 'w') as of:
        for rp in remote_paths:
            of.write(rp + '\n')
            bin_utils.report_progress(log.info, s, 1.)
    # Final report
    s[1] -= 1
    bin_utils.report_progress(log.info, s, 0)
Example #4
    def fit(self, descriptors, use_multiprocessing=True):
        """
        Fit the ITQ model given the input set of descriptors

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param use_multiprocessing: If multiprocessing should be used, as
            opposed to threading, for collecting descriptor vectors from the
            provided iterable.
        :type use_multiprocessing: bool

        :raises RuntimeError: There is already a model loaded

        :return: Matrix hash codes for provided descriptors in order.
        :rtype: numpy.ndarray[bool]

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = None
        if self.get_logger().getEffectiveLevel() <= logging.DEBUG:
            dbg_report_interval = 1.0  # seconds
        if not hasattr(descriptors, "__len__"):
            self._log.info("Creating sequence from iterable")
            descriptors_l = []
            rs = [0]*7
            for d in descriptors:
                descriptors_l.append(d)
                report_progress(self._log.debug, rs, dbg_report_interval)
            descriptors = descriptors_l
        self._log.info("Creating matrix of descriptors for fitting")
        x = elements_to_matrix(
            descriptors, report_interval=dbg_report_interval,
            use_multiprocessing=use_multiprocessing)
        self._log.debug("descriptor matrix shape: %s", x.shape)
        n, dim = x.shape

        self._log.debug("Generating random projections")
        np.random.seed(self.random_seed)
        self.rps = np.random.randn(dim, self.bit_length)

        self._log.debug("Info normalizing descriptors with norm type: %s",
                        self.normalize)
        return self.get_hash(x)
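
A toy NumPy sketch of the random-projection step this fit() ends with: draw a dim x bit_length projection matrix, project the descriptor matrix, and threshold to get boolean codes. The zero-threshold rule is an assumption about what get_hash does here.

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(100, 128)            # 100 toy descriptors, 128 dimensions
bit_length = 32

rps = rng.randn(128, bit_length)   # random projection directions, as above
codes = np.dot(x, rps) > 0.0       # assumed thresholding into boolean hash codes
print(codes.shape, codes.dtype)    # (100, 32) bool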
Example #5
def main():
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    api_root = config['tool']['girder_api_root']
    api_key = config['tool']['api_key']
    api_query_batch = config['tool']['api_query_batch']
    insert_batch_size = config['tool']['dataset_insert_batch_size']

    # Collect N folder/item/file references on CL and any files referenced.
    #: :type: list[str]
    ids_folder = args.folder
    #: :type: list[str]
    ids_item = args.item
    #: :type: list[str]
    ids_file = args.file

    if args.folder_list:
        with open(args.folder_list) as f:
            ids_folder.extend([fid.strip() for fid in f])
    if args.item_list:
        with open(args.item_list) as f:
            ids_item.extend([iid.strip() for iid in f])
    if args.file_list:
        with open(args.file_list) as f:
            ids_file.extend([fid.strip() for fid in f])

    #: :type: smqtk.representation.DataSet
    data_set = plugin.from_plugin_config(config['plugins']['data_set'],
                                         get_data_set_impls())

    batch = collections.deque()
    rps = [0] * 7
    for e in find_girder_files(api_root, ids_folder, ids_item, ids_file,
                               api_key, api_query_batch):
        batch.append(e)
        if insert_batch_size and len(batch) >= insert_batch_size:
            data_set.add_data(*batch)
            batch.clear()
        bin_utils.report_progress(log.info, rps, 1.0)

    if batch:
        data_set.add_data(*batch)

    log.info('Done')
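
The deque-based batching around data_set.add_data() is a generic pattern: buffer elements, flush a full batch with a single call, and flush the remainder at the end. A minimal generic version:

import collections

def batched_add(iterable, add_func, batch_size):
    # Buffer elements and flush with one add call per full batch, plus a
    # final flush for any remainder, mirroring the loop above.
    batch = collections.deque()
    for e in iterable:
        batch.append(e)
        if batch_size and len(batch) >= batch_size:
            add_func(*batch)
            batch.clear()
    if batch:
        add_func(*batch)

batched_add(range(10), lambda *xs: print("adding", xs), 4)
# adding (0, 1, 2, 3)
# adding (4, 5, 6, 7)
# adding (8, 9)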
Example #6
def main():
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    api_root = config['tool']['girder_api_root']
    api_key = config['tool']['api_key']
    api_query_batch = config['tool']['api_query_batch']
    insert_batch_size = config['tool']['dataset_insert_batch_size']

    # Collect N folder/item/file references on CL and any files referenced.
    #: :type: list[str]
    ids_folder = args.folder
    #: :type: list[str]
    ids_item = args.item
    #: :type: list[str]
    ids_file = args.file

    if args.folder_list:
        with open(args.folder_list) as f:
            ids_folder.extend([fid.strip() for fid in f])
    if args.item_list:
        with open(args.item_list) as f:
            ids_item.extend([iid.strip() for iid in f])
    if args.file_list:
        with open(args.file_list) as f:
            ids_file.extend([fid.strip() for fid in f])

    #: :type: smqtk.representation.DataSet
    data_set = plugin.from_plugin_config(config['plugins']['data_set'],
                                         get_data_set_impls())

    batch = collections.deque()
    rps = [0]*7
    for e in find_girder_files(api_root, ids_folder, ids_item, ids_file,
                               api_key, api_query_batch):
        batch.append(e)
        if insert_batch_size and len(batch) >= insert_batch_size:
            data_set.add_data(*batch)
            batch.clear()
        bin_utils.report_progress(log.info, rps, 1.0)

    if batch:
        data_set.add_data(*batch)

    log.info('Done')
Example #7
def main():
    args = cli_parser().parse_args()

    initialize_logging(logging.getLogger('smqtk'), logging.DEBUG)
    initialize_logging(logging.getLogger('__main__'), logging.DEBUG)
    log = logging.getLogger(__name__)

    hash2uuids_fp = os.path.abspath(args.hash2uuids_fp)
    bit_len = args.bit_len
    leaf_size = args.leaf_size
    rand_seed = args.rand_seed
    balltree_model_fp = os.path.abspath(args.balltree_model_fp)

    assert os.path.isfile(hash2uuids_fp), "Bad path: '%s'" % hash2uuids_fp
    assert os.path.isdir(os.path.dirname(balltree_model_fp)), \
        "Bad path: %s" % balltree_model_fp

    log.debug("hash2uuids_fp    : %s", hash2uuids_fp)
    log.debug("bit_len          : %d", bit_len)
    log.debug("leaf_size        : %d", leaf_size)
    log.debug("rand_seed        : %d", rand_seed)
    log.debug("balltree_model_fp: %s", balltree_model_fp)


    log.info("Loading hash2uuids table")
    with open(hash2uuids_fp) as f:
        hash2uuids = cPickle.load(f)

    log.info("Computing hash-code vectors")
    hash_vectors = []  #[int_to_bit_vector_large(h, bit_len) for h in hash2uuids]
    rs = [0] * 7
    for h in hash2uuids:
        hash_vectors.append(int_to_bit_vector_large(h, bit_len))
        report_progress(log.debug, rs, 1.)

    log.info("Initializing ball tree")
    btree = SkLearnBallTreeHashIndex(balltree_model_fp, leaf_size, rand_seed)

    log.info("Building ball tree")
    btree.build_index(hash_vectors)
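
The table keys here are integer hash codes that get expanded back into fixed-width bit vectors before building the ball tree. Below is a stand-in for that conversion; big-endian bit order is assumed, and the real int_to_bit_vector_large may differ.

import numpy as np

def int_to_bit_vector(h, bits):
    # Unpack an integer hash key into a fixed-width boolean vector.
    return np.array([(h >> i) & 1 for i in range(bits - 1, -1, -1)], dtype=bool)

print(int_to_bit_vector(5, 4))   # [False  True False  True]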
Example #8
    def fit(self, descriptors):
        """
        Fit the ITQ model given the input set of descriptors

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :raises RuntimeError: There is already a model loaded

        :return: Matrix hash codes for provided descriptors in order.
        :rtype: numpy.ndarray[bool]

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = None
        if self.logger().getEffectiveLevel() <= logging.DEBUG:
            dbg_report_interval = 1.0  # seconds
        if not hasattr(descriptors, "__len__"):
            self._log.info("Creating sequence from iterable")
            descriptors_l = []
            rs = [0] * 7
            for d in descriptors:
                descriptors_l.append(d)
                report_progress(self._log.debug, rs, dbg_report_interval)
            descriptors = descriptors_l
        self._log.info("Creating matrix of descriptors for fitting")
        x = elements_to_matrix(descriptors,
                               report_interval=dbg_report_interval)
        self._log.debug("descriptor matrix shape: %s", x.shape)

        self._log.debug("Info normalizing descriptors by factor: %s",
                        self.normalize)
        x = self._norm_vector(x)

        self._log.info("Centering data")
        self.mean_vec = numpy.mean(x, axis=0)
        x -= self.mean_vec

        self._log.info("Computing PCA transformation")
        # numpy and matlab observation format is flipped, thus the added
        # transpose.
        self._log.debug("-- computing covariance")
        c = numpy.cov(x.transpose())

        # Direct translation from UNC matlab code
        # - eigen vectors are the columns of ``pc``
        self._log.debug('-- computing linalg.eig')
        l, pc = numpy.linalg.eig(c)
        # ordered by greatest eigenvalue magnitude, keeping top ``bit_len``
        self._log.debug('-- computing top pairs')
        top_pairs = sorted(zip(l, pc.transpose()),
                           key=lambda p: p[0],
                           reverse=1)[:self.bit_length]

        # # Harry translation -- Uses singular values / vectors, not eigen
        # # - singular vectors are the rows of pc
        # pc, l, _ = numpy.linalg.svd(c)
        # top_pairs = sorted(zip(l, pc),
        #                    key=lambda p: p[0],
        #                    reverse=1
        #                    )[:self.bit_length]

        # Eigen-vectors of top ``bit_len`` magnitude eigenvalues
        self._log.debug("-- top vector extraction")
        pc_top = numpy.array([p[1] for p in top_pairs]).transpose()
        self._log.debug("-- transform centered data by PC matrix")
        xx = numpy.dot(x, pc_top)

        self._log.info("Performing ITQ to find optimal rotation")
        c, self.rotation = self._find_itq_rotation(xx, self.itq_iterations)
        # De-adjust rotation with PC vector
        self.rotation = numpy.dot(pc_top, self.rotation)

        self.save_model()

        return c
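
The PCA portion of this fit() boils down to: center the data, take the covariance, keep the eigenvectors with the largest eigenvalues, and project. A compact NumPy sketch on toy data follows; it uses eigh since the covariance is symmetric, whereas the method above uses eig plus an explicit sort.

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(500, 16)
x -= x.mean(axis=0)              # center, as in the method above
c = np.cov(x.T)                  # rows = features, columns = observations

l, pc = np.linalg.eigh(c)        # eigenvectors are the columns of pc
k = 4                            # stand-in for self.bit_length
order = np.argsort(l)[::-1][:k]  # top-k by descending eigenvalue
pc_top = pc[:, order]            # (16, k) projection matrix
xx = x.dot(pc_top)               # project centered data, as xx above
print(xx.shape)                  # (500, 4)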
Example #9
def mb_kmeans_build_apply(index, mbkm, initial_fit_size):
    """
    Build the MiniBatchKMeans centroids based on the descriptors in the given
    index, then predict descriptor clusters with the resulting model.

    If the given index is empty, no fitting or clustering occurs and an empty
    dictionary is returned.

    :param index: Index of descriptors
    :type index: smqtk.representation.DescriptorIndex

    :param mbkm: Scikit-Learn MiniBatchKMeans instance to train and then use for
        prediction
    :type mbkm: sklearn.cluster.MiniBatchKMeans

    :param initial_fit_size: Number of descriptors to run an initial fit with.
        This brings the advantage of choosing a best initialization point from
        multiple.
    :type initial_fit_size: int

    :return: Dictionary of the cluster label (integer) to the set of descriptor
        UUIDs belonging to that cluster.
    :rtype: dict[int, set[collections.Hashable]]

    """
    log = logging.getLogger(__name__)

    ifit_completed = False
    k_deque = collections.deque()
    d_fitted = 0

    log.info("Getting index keys (shuffled)")
    index_keys = sorted(six.iterkeys(index))
    numpy.random.seed(mbkm.random_state)
    numpy.random.shuffle(index_keys)

    def parallel_iter_vectors(descriptors):
        """ Get the vectors for the descriptors given.
        Not caring about order returned.
        """
        return parallel.parallel_map(lambda d: d.vector(), descriptors,
                                     use_multiprocessing=False)

    def get_vectors(k_iter):
        """ Get numpy array of descriptor vectors (2D array returned) """
        return numpy.array(list(
            parallel_iter_vectors(index.get_many_descriptors(k_iter))
        ))

    log.info("Collecting iteratively fitting model")
    rps = [0] * 7
    for i, k in enumerate(index_keys):
        k_deque.append(k)
        bin_utils.report_progress(log.debug, rps, 1.)

        if initial_fit_size and not ifit_completed:
            if len(k_deque) == initial_fit_size:
                log.info("Initial fit using %d descriptors", len(k_deque))
                log.info("- collecting vectors")
                vectors = get_vectors(k_deque)
                log.info("- fitting model")
                mbkm.fit(vectors)
                log.info("- cleaning")
                d_fitted += len(vectors)
                k_deque.clear()
                ifit_completed = True
        elif len(k_deque) == mbkm.batch_size:
            log.info("Partial fit with batch size %d", len(k_deque))
            log.info("- collecting vectors")
            vectors = get_vectors(k_deque)
            log.info("- fitting model")
            mbkm.partial_fit(vectors)
            log.info("- cleaning")
            d_fitted += len(k_deque)
            k_deque.clear()

    # Final fit with any remaining descriptors
    if k_deque:
        log.info("Final partial fit of size %d", len(k_deque))
        log.info('- collecting vectors')
        vectors = get_vectors(k_deque)
        log.info('- fitting model')
        mbkm.partial_fit(vectors)
        log.info('- cleaning')
        d_fitted += len(k_deque)
        k_deque.clear()

    log.info("Computing descriptor classes with final KMeans model")
    mbkm.verbose = False
    d_classes = collections.defaultdict(set)
    d_uv_iter = parallel.parallel_map(lambda d: (d.uuid(), d.vector()),
                                      index,
                                      use_multiprocessing=False,
                                      name="uv-collector")
    # TODO: Batch predict call inputs to something larger than one at a time.
    d_uc_iter = parallel.parallel_map(
        lambda u_v: (u_v[0], mbkm.predict(u_v[1][numpy.newaxis, :])[0]),
        d_uv_iter,
        use_multiprocessing=False,
        name="uc-collector")
    rps = [0] * 7
    for uuid, c in d_uc_iter:
        d_classes[c].add(uuid)
        bin_utils.report_progress(log.debug, rps, 1.)
    rps[1] -= 1
    bin_utils.report_progress(log.debug, rps, 0)

    return d_classes
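
On toy data, the same initial-fit / partial-fit / predict strategy with scikit-learn's MiniBatchKMeans looks like this; plain integers stand in for descriptor UUIDs.

import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.RandomState(0)
X = rng.randn(1000, 8)

mbkm = MiniBatchKMeans(n_clusters=5, batch_size=100, random_state=0)

initial_fit_size = 300
mbkm.fit(X[:initial_fit_size])                           # one larger initial fit
for start in range(initial_fit_size, len(X), mbkm.batch_size):
    mbkm.partial_fit(X[start:start + mbkm.batch_size])   # then batched updates

clusters = {}
for i, label in enumerate(mbkm.predict(X)):
    clusters.setdefault(int(label), set()).add(i)        # label -> set of "UUIDs"
print(sorted((c, len(s)) for c, s in clusters.items()))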
Example #10
def mb_kmeans_build_apply(index, mbkm, initial_fit_size):
    """
    Build the MiniBatchKMeans centroids based on the descriptors in the given
    index, then predict descriptor clusters with the resulting model.

    If the given index is empty, no fitting or clustering occurs and an empty
    dictionary is returned.

    :param index: Index of descriptors
    :type index: smqtk.representation.DescriptorIndex

    :param mbkm: Scikit-Learn MiniBatchKMeans instance to train and then use for
        prediction
    :type mbkm: sklearn.cluster.MiniBatchKMeans

    :param initial_fit_size: Number of descriptors to run an initial fit with.
        This brings the advantage of choosing a best initialization point from
        multiple.
    :type initial_fit_size: int

    :return: Dictionary of the cluster label (integer) to the set of descriptor
        UUIDs belonging to that cluster.
    :rtype: dict[int, set[collections.Hashable]]

    """
    log = logging.getLogger(__name__)

    ifit_completed = False
    k_deque = collections.deque()
    d_fitted = 0

    log.info("Getting index keys (shuffled)")
    index_keys = sorted(six.iterkeys(index))
    numpy.random.seed(mbkm.random_state)
    numpy.random.shuffle(index_keys)

    def parallel_iter_vectors(descriptors):
        """ Get the vectors for the descriptors given.
        Not caring about order returned.
        """
        return parallel.parallel_map(lambda d: d.vector(), descriptors,
                                     use_multiprocessing=False)

    def get_vectors(k_iter):
        """ Get numpy array of descriptor vectors (2D array returned) """
        return numpy.array(list(
            parallel_iter_vectors(index.get_many_descriptors(k_iter))
        ))

    log.info("Collecting iteratively fitting model")
    rps = [0] * 7
    for i, k in enumerate(index_keys):
        k_deque.append(k)
        bin_utils.report_progress(log.debug, rps, 1.)

        if initial_fit_size and not ifit_completed:
            if len(k_deque) == initial_fit_size:
                log.info("Initial fit using %d descriptors", len(k_deque))
                log.info("- collecting vectors")
                vectors = get_vectors(k_deque)
                log.info("- fitting model")
                mbkm.fit(vectors)
                log.info("- cleaning")
                d_fitted += len(vectors)
                k_deque.clear()
                ifit_completed = True
        elif len(k_deque) == mbkm.batch_size:
            log.info("Partial fit with batch size %d", len(k_deque))
            log.info("- collecting vectors")
            vectors = get_vectors(k_deque)
            log.info("- fitting model")
            mbkm.partial_fit(vectors)
            log.info("- cleaning")
            d_fitted += len(k_deque)
            k_deque.clear()

    # Final fit with any remaining descriptors
    if k_deque:
       log.info("Final partial fit of size %d", len(k_deque))
       log.info('- collecting vectors')
       vectors = get_vectors(k_deque)
       log.info('- fitting model')
       mbkm.partial_fit(vectors)
       log.info('- cleaning')
       d_fitted += len(k_deque)
       k_deque.clear()

    log.info("Computing descriptor classes with final KMeans model")
    mbkm.verbose = False
    d_classes = collections.defaultdict(set)
    d_uv_iter = parallel.parallel_map(lambda d: (d.uuid(), d.vector()),
                                      index,
                                      use_multiprocessing=False,
                                      name="uv-collector")
    # TODO: Batch predict call inputs to something larger than one at a time.
    d_uc_iter = parallel.parallel_map(
        lambda u_v: (u_v[0], mbkm.predict(u_v[1][numpy.newaxis, :])[0]),
        d_uv_iter,
        use_multiprocessing=False,
        name="uc-collector")
    rps = [0] * 7
    for uuid, c in d_uc_iter:
        d_classes[c].add(uuid)
        bin_utils.report_progress(log.debug, rps, 1.)
    rps[1] -= 1
    bin_utils.report_progress(log.debug, rps, 0)

    return d_classes
Example #11
def main():
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    # - parallel_map UUIDs to load from the configured index
    # - classify iterated descriptors

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header
    classify_overwrite = config['utility']['classify_overwrite']

    p_use_multiprocessing = \
        config['utility']['parallel']['use_multiprocessing']
    p_index_extraction_cores = \
        config['utility']['parallel']['index_extraction_cores']
    p_classification_cores = \
        config['utility']['parallel']['classification_cores']

    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    elif not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'], get_descriptor_index_impls())

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory'])

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(config['plugins']['classifier'],
                                           get_classifier_impls())

    #
    # Setup/Process
    #
    def iter_uuids():
        with open(uuids_list_filepath) as f:
            for l in f:
                yield l.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid,
        iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr,
        element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #

    c_labels = classifier.get_labels()

    def make_row(c):
        """
        :type c: smqtk.representation.ClassificationElement
        """
        c_m = c.get_classification()
        return [c.uuid] + [c_m[l] for l in c_labels]

    # column labels file
    log.info("Writing CSV column header file: %s", output_csv_header_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        w.writerow(['uuid'] + c_labels)

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_filepath))
    r_state = [0] * 7
    with open(output_csv_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        for c in classification_iter:
            w.writerow(make_row(c))
            bin_utils.report_progress(log.info, r_state, 1.0)

    # Final report
    r_state[1] -= 1
    bin_utils.report_progress(log.info, r_state, 0)

    log.info("Done")
Example #12
def run_file_list(c,
                  filelist_filepath,
                  checkpoint_filepath,
                  batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath
        to SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before queueing
        that file for processing. If the check fails, the file is skipped
        instead of a halting exception being raised.
    :type check_image: bool

    """
    log = logging.getLogger(__name__)

    file_paths = [l.strip() for l in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    data_set = None
    if c['optional_data_set']['type'] is None:
        log.info("Not saving loaded data elements to data set")
    else:
        log.info("Initializing data set to append to")
        #: :type: smqtk.representation.DataSet
        data_set = plugin.from_plugin_config(c['optional_data_set'],
                                             get_data_set_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def iter_valid_elements():
        def is_valid(file_path):
            dfe = DataFileElement(file_path)

            if is_valid_element(
                    dfe,
                    valid_content_types=generator.valid_content_types(),
                    check_image=check_image):
                return dfe
            else:
                return False

        data_elements = collections.deque()
        valid_files_filter = parallel.parallel_map(is_valid,
                                                   file_paths,
                                                   name="check-file-type",
                                                   use_multiprocessing=True)
        for dfe in valid_files_filter:
            if dfe:
                yield dfe
                if data_set is not None:
                    data_elements.append(dfe)
                    if batch_size and len(data_elements) == batch_size:
                        log.debug(
                            "Adding data element batch to set (size: %d)",
                            len(data_elements))
                        data_set.add_data(*data_elements)
                        data_elements.clear()
        # elements only collected if we have a data-set configured, so add any
        # still in the deque to the set
        if data_elements:
            log.debug("Adding data elements to set (size: %d",
                      len(data_elements))
            data_set.add_data(*data_elements)

    log.info("Computing descriptors")
    m = compute_many_descriptors(
        iter_valid_elements(),
        generator,
        factory,
        descriptor_index,
        batch_size=batch_size,
    )

    # Recording computed file paths and associated file UUIDs (SHA1)
    cf = open(checkpoint_filepath, 'w')
    cf_writer = csv.writer(cf)
    try:
        rps = [0] * 7
        for fp, descr in m:
            cf_writer.writerow([fp, descr.uuid()])
            report_progress(log.debug, rps, 1.)
    finally:
        del cf_writer
        cf.close()

    log.info("Done")
Example #13
File: itq.py  Project: dhandeo/SMQTK
    def fit(self, descriptors):
        """
        Fit the ITQ model given the input set of descriptors

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :raises RuntimeError: There is already a model loaded

        :return: Matrix hash codes for provided descriptors in order.
        :rtype: numpy.ndarray[bool]

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = None
        if self.logger().getEffectiveLevel() <= logging.DEBUG:
            dbg_report_interval = 1.0  # seconds
        if not hasattr(descriptors, "__len__"):
            self._log.info("Creating sequence from iterable")
            descriptors_l = []
            rs = [0]*7
            for d in descriptors:
                descriptors_l.append(d)
                report_progress(self._log.debug, rs, dbg_report_interval)
            descriptors = descriptors_l
        self._log.info("Creating matrix of descriptors for fitting")
        x = elements_to_matrix(descriptors, report_interval=dbg_report_interval)
        self._log.debug("descriptor matrix shape: %s", x.shape)

        self._log.debug("Info normalizing descriptors by factor: %s",
                        self.normalize)
        x = self._norm_vector(x)

        self._log.info("Centering data")
        self.mean_vec = numpy.mean(x, axis=0)
        x -= self.mean_vec

        self._log.info("Computing PCA transformation")
        # numpy and matlab observation format is flipped, thus the added
        # transpose.
        self._log.debug("-- computing covariance")
        c = numpy.cov(x.transpose())

        # Direct translation from UNC matlab code
        # - eigen vectors are the columns of ``pc``
        self._log.debug('-- computing linalg.eig')
        l, pc = numpy.linalg.eig(c)
        # ordered by greatest eigenvalue magnitude, keeping top ``bit_len``
        self._log.debug('-- computing top pairs')
        top_pairs = sorted(zip(l, pc.transpose()),
                           key=lambda p: p[0],
                           reverse=1
                           )[:self.bit_length]

        # # Harry translation -- Uses singular values / vectors, not eigen
        # # - singular vectors are the rows of pc
        # pc, l, _ = numpy.linalg.svd(c)
        # top_pairs = sorted(zip(l, pc),
        #                    key=lambda p: p[0],
        #                    reverse=1
        #                    )[:self.bit_length]

        # Eigen-vectors of top ``bit_len`` magnitude eigenvalues
        self._log.debug("-- top vector extraction")
        pc_top = numpy.array([p[1] for p in top_pairs]).transpose()
        self._log.debug("-- transform centered data by PC matrix")
        xx = numpy.dot(x, pc_top)

        self._log.info("Performing ITQ to find optimal rotation")
        c, self.rotation = self._find_itq_rotation(xx, self.itq_iterations)
        # De-adjust rotation with PC vector
        self.rotation = numpy.dot(pc_top, self.rotation)

        self.save_model()

        return c
Example #14
    def compute_descriptor_async(self,
                                 data_iter,
                                 descr_factory=DFLT_DESCRIPTOR_FACTORY,
                                 overwrite=False,
                                 procs=None,
                                 **kwds):
        """
        Asynchronously compute feature data for multiple data items.

        :param data_iter: Iterable of data elements to compute features for.
            These must have UIDs assigned for feature association in return
            value.
        :type data_iter: collections.Iterable[smqtk.representation.DataElement]

        :param descr_factory: Factory instance to produce the wrapping
            descriptor element instances. The default factory produces
            ``DescriptorMemoryElement`` instances.
        :type descr_factory: smqtk.representation.DescriptorElementFactory

        :param overwrite: Whether or not to force re-computation of descriptor
            vectors for the given data even when precomputed vectors already
            exist in the DescriptorElements generated from the provided
            factory. This will overwrite the persistently stored vectors if
            the provided factory produces a DescriptorElement implementation
            with such storage.
        :type overwrite: bool

        :param procs: Optional specification of how many processors to use
            when pooling sub-tasks. If None, we attempt to use all available
            cores.
        :type procs: int | None

        :raises ValueError: An input DataElement was of a content type that we
            cannot handle.

        :return: Mapping of input DataElement UUIDs to the computed descriptor
            element for that data. DescriptorElement UUID's are congruent with
            the UUID of the data element it is the descriptor of.
        :rtype: dict[collections.Hashable,
                     smqtk.representation.DescriptorElement]

        """
        self._set_caffe_mode()

        # Create DescriptorElement instances for each data elem.
        #: :type: dict[collections.Hashable, smqtk.representation.DataElement]
        data_elements = {}
        #: :type: dict[collections.Hashable, smqtk.representation.DescriptorElement]
        descr_elements = {}
        self._log.debug("Checking content types; aggregating data/descriptor "
                        "elements.")
        prog_rep_state = [0] * 7
        for data in data_iter:
            ct = data.content_type()
            if ct not in self.valid_content_types():
                self._log.error("Cannot compute descriptor from content type "
                                "'%s' data: %s)" % (ct, data))
                raise ValueError("Cannot compute descriptor from content type "
                                 "'%s' data: %s)" % (ct, data))
            data_elements[data.uuid()] = data
            descr_elements[data.uuid()] = \
                descr_factory.new_descriptor(self.name, data.uuid())
            report_progress(self._log.debug, prog_rep_state, 1.0)
        self._log.debug("Given %d unique data elements", len(data_elements))

        # Reduce procs down to the number of elements to process if it's smaller
        if len(data_elements) < (procs or multiprocessing.cpu_count()):
            procs = len(data_elements)
        if procs == 0:
            raise ValueError("No data elements provided")

        # For thread safety, only use .append() and .popleft() (queue)
        uuid4proc = deque()

        def check_get_uuid(descriptor_elem):
            if overwrite or not descriptor_elem.has_vector():
                # noinspection PyUnresolvedReferences
                uuid4proc.append(descriptor_elem.uuid())

        # Using thread-pool due to in-line function + updating local deque
        p = multiprocessing.pool.ThreadPool(procs)
        try:
            p.map(check_get_uuid, six.itervalues(descr_elements))
        finally:
            p.close()
            p.join()
        del p
        self._log.debug("%d descriptors already computed",
                        len(data_elements) - len(uuid4proc))

        if uuid4proc:
            self._log.debug("Converting deque to tuple for segmentation")
            uuid4proc = tuple(uuid4proc)

            # Split UUIDs into groups equal to our batch size, and an optional
            # tail group that is smaller than our batch size.
            tail_size = len(uuid4proc) % self.batch_size
            batch_groups = (len(uuid4proc) - tail_size) // self.batch_size
            self._log.debug("Processing %d batches of size %d", batch_groups,
                            self.batch_size)
            if tail_size:
                self._log.debug("Processing tail group of size %d", tail_size)

            if batch_groups:
                for g in range(batch_groups):
                    self._log.debug("Starting batch: %d of %d", g + 1,
                                    batch_groups)
                    batch_uuids = \
                        uuid4proc[g * self.batch_size:(g + 1) * self.batch_size]
                    self._process_batch(batch_uuids, data_elements,
                                        descr_elements, procs,
                                        kwds.get('use_mp', True))

            if tail_size:
                batch_uuids = uuid4proc[-tail_size:]
                self._log.debug("Starting tail batch (size=%d)",
                                len(batch_uuids))
                self._process_batch(batch_uuids, data_elements, descr_elements,
                                    procs, kwds.get('use_mp', True))

        self._log.debug("forming output dict")
        return dict((data_elements[k].uuid(), descr_elements[k])
                    for k in data_elements)
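
The batch/tail arithmetic used above to segment uuid4proc generalizes to any sequence; a small sketch that reproduces it:

def split_batches(items, batch_size):
    # Full groups of batch_size, then one optional tail group with the
    # remainder, matching the segmentation in compute_descriptor_async above.
    items = tuple(items)
    tail_size = len(items) % batch_size
    batch_groups = (len(items) - tail_size) // batch_size
    batches = [items[g * batch_size:(g + 1) * batch_size]
               for g in range(batch_groups)]
    if tail_size:
        batches.append(items[-tail_size:])
    return batches

print(split_batches(range(10), 4))   # [(0, 1, 2, 3), (4, 5, 6, 7), (8, 9)]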
Example #15
        dl_image, iter_scan_meta(),
        name='image_download',
        use_multiprocessing=True,
        cores=cores
    )

    # Write out
    log.info("Starting iteration/file-write")
    rp_state = [0] * 7
    with open(scan_record, 'w') as record_file:
        for r in img_dl_records:
            if r is not None:
                cdr_id, local_path, uuid = r
                record_file.write('%s,%s,%s\n'
                                  % (cdr_id, local_path, uuid))
            report_progress(log.debug, rp_state, 1.0)
        # Final report
        rp_state[1] -= 1
        report_progress(log.debug, rp_state, 0)


def default_config():
    return {
        "image_types": ['jpeg', 'png', 'tiff'],
        "elastic_search": {
            "instance_address": "CHANGEME",
            "index": "CHANGEME",
            "username": "******",
            "password": "******",
            "batch_size": 10000,
        },
Example #16
def compute_distance_kernel(m, dist_func, row_wise=False, parallel=True):
    """
    Method for computing the distance kernel of an array of vectors given a
    distance function that works on two supplied 1D arrays.

    For a valid distance function interface, see
    ``smqtk.utils.distance_functions.histogram_intersection_distance2``.

    :param m: An array of vectors to compute the pairwise distance kernel for.
    :type m: numpy.core.multiarray.ndarray

    :param dist_func: Distance function
    :type dist_func: (ndarray, ndarray) -> ndarray[float] | float

    :param row_wise: If the given distance function can take a vector and a
        matrix, and computes pair-wise distances, returning a vector of
        distances between the given vector and each row of the matrix.
    :type row_wise: bool

    :param parallel: If distances should be calculated in parallel. This is true
        by default.
    :type parallel: bool

    :return: Computed symmetric distance kernel
    :rtype: numpy.core.multiarray.ndarray

    """
    if hasattr(dist_func, 'im_func'):
        # noinspection PyUnresolvedReferences
        distance_name = '.'.join([
            dist_func.__module__, dist_func.im_class.__name__,
            dist_func.im_func.func_name
        ])
    elif hasattr(dist_func, 'func_name'):
        # noinspection PyUnresolvedReferences
        distance_name = '.'.join([dist_func.__module__, dist_func.func_name])
    elif hasattr(dist_func, 'py_func') \
            and hasattr(dist_func.py_func, 'func_name'):
        distance_name = '.'.join(
            [dist_func.__module__, dist_func.py_func.func_name])
    else:
        distance_name = "<unknown>"
    log = logging.getLogger('compute_distance_kernel[%s]' % distance_name)

    if m.ndim == 1:
        m = m[np.newaxis]

    log.info("Computing distance kernel")
    side = m.shape[0]
    mat = np.ndarray((side, side), dtype=float)

    if row_wise:
        log.debug("Computing row-wise distances")
        # For all rows except the last one. We'll have computed all distances by
        # the time we reach m[side-1].
        if parallel:

            def work_func(i):
                mat[i, i] = 0.
                if i < (side - 1):
                    mat[i + 1:,
                        i] = mat[i, i + 1:] = dist_func(m[i, :], m[i + 1:, :])

            # Using threading for in-place modification
            s = [0] * 7
            for _ in parallel_map(work_func,
                                  xrange(side),
                                  use_multiprocessing=False):
                report_progress(log.debug, s, 1.)
        else:
            for i in xrange(side):
                # Compute col/row wise distances
                mat[i, i] = 0.
                if i < (side - 1):
                    mat[i + 1:,
                        i] = mat[i, i + 1:] = dist_func(m[i, :], m[i + 1:, :])
    else:
        log.debug("Computing element-wise distances")
        if parallel:

            def work_func(i):
                mat[i, i] = 0
                # cols to the left of diagonal index for this row
                for j in xrange(i):
                    mat[i, j] = mat[j, i] = dist_func(m[i], m[j])

            # Using threading for in-place modification
            s = [0] * 7
            for _ in parallel_map(work_func,
                                  xrange(side),
                                  use_multiprocessing=False):
                report_progress(log.debug, s, 1.)
        else:
            for i in xrange(side):
                mat[i, i] = 0
                # cols to the left of diagonal index for this row
                for j in xrange(i):
                    mat[i, j] = mat[j, i] = dist_func(m[i], m[j])

    return mat
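
A toy run of the row-wise fill pattern with an ordinary Euclidean distance, showing how each iteration writes one row/column pair of the symmetric kernel while the diagonal stays zero. With compute_distance_kernel in scope, the same function could be passed directly as compute_distance_kernel(m, euclidean_rowwise, row_wise=True).

import numpy as np

def euclidean_rowwise(v, m):
    # Distance between one vector and each row of a matrix (row_wise=True shape).
    return np.sqrt(((m - v) ** 2).sum(axis=1))

m = np.random.RandomState(0).randn(5, 3)
side = m.shape[0]
mat = np.zeros((side, side))
for i in range(side - 1):
    mat[i + 1:, i] = mat[i, i + 1:] = euclidean_rowwise(m[i, :], m[i + 1:, :])
print(np.allclose(mat, mat.T), mat[0, 1] > 0)   # True True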
Example #17
    log.info("Initializing image download/record parallel iterator")
    img_dl_records = parallel_map(dl_image,
                                  iter_scan_meta(),
                                  name='image_download',
                                  use_multiprocessing=True,
                                  cores=cores)

    # Write out
    log.info("Starting iteration/file-write")
    rp_state = [0] * 7
    with open(scan_record, 'w') as record_file:
        for r in img_dl_records:
            if r is not None:
                cdr_id, local_path, uuid = r
                record_file.write('%s,%s,%s\n' % (cdr_id, local_path, uuid))
            report_progress(log.debug, rp_state, 1.0)
        # Final report
        rp_state[1] -= 1
        report_progress(log.debug, rp_state, 0)


def default_config():
    return {
        "image_types": ['jpeg', 'png', 'tiff'],
        "elastic_search": {
            "instance_address": "CHANGEME",
            "index": "CHANGEME",
            "username": "******",
            "password": "******",
            "batch_size": 10000,
        },
Example #18
    def fit(self, descriptors, use_multiprocessing=True):
        """
        Fit the ITQ model given the input set of descriptors.

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :raises RuntimeError: There is already a model loaded

        :return: Matrix hash codes for provided descriptors in order.
        :rtype: numpy.ndarray[bool]

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = None
        if self.get_logger().getEffectiveLevel() <= logging.DEBUG:
            dbg_report_interval = 1.0  # seconds
        if not isinstance(descriptors, Sequence):
            self._log.info("Creating sequence from iterable")
            descriptors_l = []
            rs = [0] * 7
            for d in descriptors:
                descriptors_l.append(d)
                report_progress(self._log.debug, rs, dbg_report_interval)
            descriptors = descriptors_l
        if len(descriptors[0].vector()) < self.bit_length:
            raise ValueError("Input descriptors have fewer features than "
                             "requested bit encoding. Hash codes will be "
                             "smaller than requested due to PCA decomposition "
                             "result being bound by number of features.")

        self._log.info("Creating matrix of descriptors for fitting")
        x = elements_to_matrix(descriptors,
                               report_interval=dbg_report_interval,
                               use_multiprocessing=use_multiprocessing)
        self._log.debug("descriptor matrix shape: %s", x.shape)

        self._log.debug("Info normalizing descriptors by factor: %s",
                        self.normalize)
        x = self._norm_vector(x)

        self._log.info("Centering data")
        self.mean_vec = numpy.mean(x, axis=0)
        x -= self.mean_vec

        self._log.info("Computing PCA transformation")
        self._log.debug("-- computing covariance")
        # ``cov`` wants each row to be a feature and each column an observation
        # of those features. Thus, each column should be a descriptor vector,
        # thus we need the transpose here.
        c = numpy.cov(x.transpose())

        if True:
            # Direct translation from UNC matlab code
            # - eigen vectors are the columns of ``pc``
            self._log.debug('-- computing linalg.eig')
            l, pc = numpy.linalg.eig(c)
            self._log.debug('-- ordering eigen vectors by descending eigen '
                            'value')
        else:
            # Harry translation -- Uses singular values / vectors, not eigen
            # - singular vectors are the columns of pc
            self._log.debug('-- computing linalg.svd')
            pc, l, _ = numpy.linalg.svd(c)
            self._log.debug('-- ordering singular vectors by descending '
                            'singular value')

        # Same ordering method for both eig/svd sources.
        l_pc_ordered = sorted(zip(l, pc.transpose()),
                              key=lambda p: p[0],
                              reverse=1)

        self._log.debug("-- top vector extraction")
        # Only keep the top ``bit_length`` vectors after ordering by descending
        # value magnitude.
        # - Transposing vectors back to column-vectors.
        pc_top = numpy.array([p[1] for p in l_pc_ordered[:self.bit_length]])\
            .transpose()
        self._log.debug("-- project centered data by PC matrix")
        v = numpy.dot(x, pc_top)

        self._log.info("Performing ITQ to find optimal rotation")
        c, self.rotation = self._find_itq_rotation(v, self.itq_iterations)
        # De-adjust rotation with PC vector
        self.rotation = numpy.dot(pc_top, self.rotation)

        self.save_model()

        return c
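
A quick check of the two decomposition branches kept above: for a symmetric covariance matrix the eigenvalues and singular values coincide, so either branch selects the same top directions (up to sign). The sketch uses eigh for the symmetric case rather than the eig call in the method.

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(200, 8)
x -= x.mean(axis=0)
c = np.cov(x.T)

l_eig, pc_eig = np.linalg.eigh(c)        # ascending eigenvalues, column vectors
pc_svd, l_svd, _ = np.linalg.svd(c)      # descending singular values

print(np.allclose(sorted(l_eig, reverse=True), l_svd))   # True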
Example #19
def main():
    description = """
    Script for asynchronously computing classifications for DescriptorElements
    in a DescriptorIndex specified via a list of UUIDs. Results are output to a
    CSV file in the format:

        uuid, label1_confidence, label2_confidence, ...

    CSV columns labels are output to the given CSV header file path. Label
    columns will be in the order as reported by the classifier implementations
    ``get_labels`` method.

    Due to using an input file-list of UUIDs, we require that the UUIDs of
    indexed descriptors be strings, or equality comparable to the UUIDs' string
    representation.
    """

    args, config = bin_utils.utility_main_helper(
        default_config,
        description,
        extend_parser,
    )
    log = logging.getLogger(__name__)

    # - parallel_map UUIDs to load from the configured index
    # - classify iterated descriptors

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header
    classify_overwrite = config['utility']['classify_overwrite']

    p_use_multiprocessing = \
        config['utility']['parallel']['use_multiprocessing']
    p_index_extraction_cores = \
        config['utility']['parallel']['index_extraction_cores']
    p_classification_cores = \
        config['utility']['parallel']['classification_cores']

    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    elif not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory']
    )

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        config['plugins']['classifier'], get_classifier_impls()
    )

    #
    # Setup/Process
    #
    def iter_uuids():
        with open(uuids_list_filepath) as f:
            for l in f:
                yield l.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid, iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr, element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #

    c_labels = classifier.get_labels()

    def make_row(c):
        """
        :type c: smqtk.representation.ClassificationElement
        """
        c_m = c.get_classification()
        return [c.uuid] + [c_m[l] for l in c_labels]

    # column labels file
    log.info("Writing CSV column header file: %s", output_csv_header_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        w.writerow(['uuid'] + c_labels)

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_filepath))
    r_state = [0] * 7
    with open(output_csv_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        for c in classification_iter:
            w.writerow(make_row(c))
            bin_utils.report_progress(log.info, r_state, 1.0)

    # Final report
    r_state[1] -= 1
    bin_utils.report_progress(log.info, r_state, 0)

    log.info("Done")
Example #20
    m = compute_many_descriptors(iter_valid_elements(),
                                 generator,
                                 factory,
                                 descriptor_index,
                                 batch_size=batch_size,
                                 )

    # Recording computed file paths and associated file UUIDs (SHA1)
    cf = open(checkpoint_filepath, 'w')
    try:
        rps = [0] * 7
        for fp, descr in m:
            cf.write("{:s},{:s}\n".format(
                fp, descr.uuid()
            ))
            report_progress(log.debug, rps, 1.)
        # Final report
        rps[1] -= 1
        report_progress(log.debug, rps, 0.)
    finally:
        cf.close()

    log.info("Done")


def extend_parser(parser):
    parser.add_argument('-b', '--batch-size',
                        type=int, default=256, metavar='INT',
                        help="Number of files to batch together into a single "
                             "compute async call. This defines the "
                             "granularity of the checkpoint file in regards "
Example #21
    def compute_descriptor_async(self, data_iter,
                                 descr_factory=DFLT_DESCRIPTOR_FACTORY,
                                 overwrite=False, procs=None, **kwds):
        """
        Asynchronously compute feature data for multiple data items.

        :param data_iter: Iterable of data elements to compute features for.
            These must have UIDs assigned for feature association in return
            value.
        :type data_iter: collections.Iterable[smqtk.representation.DataElement]

        :param descr_factory: Factory instance to produce the wrapping
            descriptor element instances. In-Memory descriptor factory by
            default.
        :type descr_factory: smqtk.representation.DescriptorElementFactory

        :param overwrite: Whether or not to force re-computation of descriptor
            vectors for the given data even when precomputed vectors already
            exist in the DescriptorElements generated from the provided
            factory. This will overwrite the persistently stored vectors if
            the provided factory produces a DescriptorElement implementation
            with such storage.
        :type overwrite: bool

        :param procs: Optional specification of how many processors to use
            when pooling sub-tasks. If None, we attempt to use all available
            cores.
        :type procs: int

        :raises ValueError: An input DataElement was of a content type that we
            cannot handle.

        :return: Mapping of input DataElement instances to the computed
            descriptor element.
            DescriptorElement UUIDs match the UUID of the data element they
            describe.
        :rtype: dict[smqtk.representation.DataElement,
                     smqtk.representation.DescriptorElement]

        """
        # Create DescriptorElement instances for each data elem.
        #: :type: dict[collections.Hashable, smqtk.representation.DataElement]
        data_elements = {}
        #: :type: dict[collections.Hashable, smqtk.representation.DescriptorElement]
        descr_elements = {}
        self._log.debug("Checking content types; aggregating data/descriptor "
                        "elements.")
        prog_rep_state = [0] * 7
        for d in data_iter:
            ct = d.content_type()
            if ct not in self.valid_content_types():
                raise ValueError("Cannot compute descriptor of content type "
                                 "'%s' (DE: %s)" % (ct, d))
            data_elements[d.uuid()] = d
            descr_elements[d.uuid()] = \
                descr_factory.new_descriptor(self.name, d.uuid())
            report_progress(self._log.debug, prog_rep_state, 1.0)
        self._log.debug("Given %d unique data elements", len(data_elements))

        # Reduce procs down to the number of elements to process if it's smaller
        if len(data_elements) < (procs or multiprocessing.cpu_count()):
            procs = len(data_elements)

        # For thread safety, only use .append() and .popleft() (queue)
        uuid4proc = deque()

        def check_get_uuid(d):
            if overwrite or not d.has_vector():
                # noinspection PyUnresolvedReferences
                uuid4proc.append(d.uuid())

        p = multiprocessing.pool.ThreadPool(procs)
        try:
            p.map(check_get_uuid, descr_elements.itervalues())
        finally:
            p.close()
            p.join()
        del p
        self._log.debug("%d descriptors already computed",
                        len(data_elements) - len(uuid4proc))

        if uuid4proc:
            self._log.debug("Converting deque to tuple for segmentation")
            uuid4proc = tuple(uuid4proc)

            # Split UUIDs into groups equal to our batch size, and an optional
            # tail group that is smaller than our batch size.
            tail_size = len(uuid4proc) % self.batch_size
            batch_groups = (len(uuid4proc) - tail_size) // self.batch_size
            self._log.debug("Processing %d batches of size %d", batch_groups,
                            self.batch_size)
            if tail_size:
                self._log.debug("Processing tail group of size %d", tail_size)

            if batch_groups:
                for g in xrange(batch_groups):
                    self._log.debug("Starting batch: %d of %d",
                                    g + 1, batch_groups)
                    batch_uuids = \
                        uuid4proc[g*self.batch_size:(g+1)*self.batch_size]
                    self._process_batch(batch_uuids, data_elements,
                                        descr_elements, procs)

            if tail_size:
                batch_uuids = uuid4proc[-tail_size:]
                self._log.debug("Starting tail batch (size=%d)",
                                len(batch_uuids))
                self._process_batch(batch_uuids, data_elements, descr_elements,
                                    procs)

        self._log.debug("forming output dict")
        return dict((data_elements[k], descr_elements[k])
                    for k in data_elements)
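A minimal usage sketch for the method above (not part of the source): here
``generator`` stands in for any DescriptorGenerator implementation exposing
compute_descriptor_async, and ``data_elements`` for an iterable of DataElement
instances with UIDs assigned; both names are hypothetical placeholders.

# Hypothetical caller-side usage, relying only on the documented signature and
# the default in-memory descriptor factory.
de2descr = generator.compute_descriptor_async(data_elements, procs=4)
for data_e, descr_e in de2descr.iteritems():
    # Each DescriptorElement UUID mirrors its source DataElement UUID.
    print data_e.uuid(), descr_e.vector()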
Example #22
def compute_distance_kernel(m, dist_func, row_wise=False, parallel=True):
    """
    Method for computing the distance kernel of an array of vectors given a
    distance function that works on two supplied 1D arrays.

    For a valid distance function interface, see
    ``smqtk.utils.distance_functions.histogram_intersection_distance2``.

    :param m: An array of vectors to compute the pairwise distance kernel for.
    :type m: numpy.ndarray

    :param dist_func: Distance function
    :type dist_func: (ndarray, ndarray) -> ndarray[float] | float

    :param row_wise: If True, the given distance function can take a vector
        and a matrix and compute pair-wise distances, returning a vector of
        distances between the given vector and each row of the matrix.
    :type row_wise: bool

    :param parallel: If distances should be calculated in parallel. This is true
        by default.
    :type parallel: bool

    :return: Computed symmetric distance kernel
    :rtype: numpy.ndarray

    """
    log = logging.getLogger(__name__)

    if m.ndim == 1:
        m = m[np.newaxis]

    log.info("Computing distance kernel")
    side = m.shape[0]
    mat = np.ndarray((side, side), dtype=float)

    s = [0] * 7
    if row_wise:
        log.debug("Computing row-wise distances")
        # For all rows except the last one. We'll have computed all distances
        # by the time we reach m[side-1].
        if parallel:
            # noinspection PyShadowingNames
            def work_func(i):
                mat[i, i] = dist_func(m[i], m[i])
                if i < (side - 1):
                    mat[i + 1:, i] = mat[i, i + 1:] = dist_func(m[i, :],
                                                                m[i + 1:, :])
            # Using threading for in-place modification
            s = [0] * 7
            for _ in parallel_map(work_func, range(side),
                                  use_multiprocessing=False):
                report_progress(log.debug, s, 1.)
        else:
            for i in range(side):
                # Compute col/row wise distances
                mat[i, i] = dist_func(m[i], m[i])
                if i < (side-1):
                    mat[i+1:, i] = mat[i, i+1:] = dist_func(m[i, :], m[i+1:, :])
                report_progress(log.debug, s, 1.)
    else:
        log.debug("Computing element-wise distances")
        if parallel:
            # noinspection PyShadowingNames
            def work_func(i):
                mat[i, i] = dist_func(m[i], m[i])
                # cols to the left of diagonal index for this row
                for j in range(i):
                    mat[i, j] = mat[j, i] = dist_func(m[i], m[j])
            # Using threading for in-place modification
            for _ in parallel_map(work_func, range(side),
                                  use_multiprocessing=False):
                report_progress(log.debug, s, 1.)
        else:
            for i in range(side):
                mat[i, i] = dist_func(m[i], m[i])
                # cols to the left of diagonal index for this row
                for j in range(i):
                    mat[i, j] = mat[j, i] = dist_func(m[i], m[j])
                report_progress(log.debug, s, 1.)

    return mat
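To make the ``dist_func`` interface concrete, here is a minimal sketch (not
from the source) that runs a tiny matrix and a plain Euclidean distance through
the function above; the ``euclidean`` helper is illustrative only and assumes
the module's other imports (logging, report_progress) are in place.

import numpy as np

def euclidean(a, b):
    # Pairwise (1D, 1D) -> float, so row_wise stays at its default of False.
    return np.sqrt(((a - b) ** 2).sum())

m = np.array([[0., 0.], [3., 4.], [6., 8.]])
k = compute_distance_kernel(m, euclidean, row_wise=False, parallel=False)
# k is a symmetric 3x3 kernel with zeros on the diagonal, e.g. k[0, 1] == 5.0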
Example #23
    log.info("Computing descriptors")
    m = compute_many_descriptors(
        iter_valid_elements(),
        generator,
        factory,
        descriptor_index,
        batch_size=batch_size,
    )

    # Recording computed file paths and associated file UUIDs (SHA1)
    cf = open(checkpoint_filepath, 'w')
    try:
        rps = [0] * 7
        for fp, descr in m:
            cf.write("{:s},{:s}\n".format(fp, descr.uuid()))
            report_progress(log.debug, rps, 1.)
    finally:
        cf.close()

    log.info("Done")


def cli_parser():
    parser = basic_cli_parser(__doc__)

    parser.add_argument('-b',
                        '--batch-size',
                        type=int,
                        default=0,
                        metavar='INT',
                        help="Number of files to batch together into a single "
Example #24
def main():
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    # - parallel_map UUIDs to load from the configured index
    # - classify iterated descriptors

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header
    classify_overwrite = config['utility']['classify_overwrite']

    p_use_multiprocessing = \
        config['utility']['parallel']['use_multiprocessing']
    p_index_extraction_cores = \
        config['utility']['parallel']['index_extraction_cores']
    p_classification_cores = \
        config['utility']['parallel']['classification_cores']

    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    elif not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory']
    )

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        config['plugins']['classifier'], get_classifier_impls()
    )

    #
    # Setup/Process
    #
    def iter_uuids():
        with open(uuids_list_filepath) as f:
            for l in f:
                yield l.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid, iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr, element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #

    c_labels = classifier.get_labels()

    def make_row(e):
        """
        :type e: smqtk.representation.ClassificationElement
        """
        c_m = e.get_classification()
        return [e.uuid] + [c_m[l] for l in c_labels]

    # column labels file
    log.info("Writing CSV column header file: %s", output_csv_header_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        w.writerow(['uuid'] + [str(cl) for cl in c_labels])

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_filepath))
    r_state = [0] * 7
    with open(output_csv_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        for c in classification_iter:
            w.writerow(make_row(c))
            bin_utils.report_progress(log.info, r_state, 1.0)

    # Final report
    r_state[1] -= 1
    bin_utils.report_progress(log.info, r_state, 0)

    log.info("Done")