Example #1
def main(args):
    data_dir = args.data_dir
    out_dir = args.out_dir

    if data_dir is None:
        raise ValueError('data_dir should be assigned.')

    container = EmbeddingContainer()
    container.load(data_dir)

    hg = HierarchicalGrouping(container)
    df = hg.auto_label_subgroup(label_id=1)
    print(df)
Example #2
    def __init__(self, embedding_size, prob_size, config_dict, mode='online'):
        """Evaluator Builder.

          The object builds evaluation functions according to the given configuration
          and manages shared data (embeddings, labels and attributes) in container objects.

          Args:
            embedding_size: Integer, size of the 1-d embedding vector.
            prob_size: Integer, size of the logits.
            config_dict: Dict, loaded from the YAML config.
            mode: String, `online` or `offline`.

          Building procedure: (TODO @kv: update these steps)
            * parse the config
            * allocate containers
            * create evaluations
            * add datum
            * run evaluate
            * (optional) get update_ops
        TODO:
            - deprecate attribute container
        """
        self.configs = ConfigParser(config_dict)

        # allocate shared embedding containers
        container_size = self.configs.container_size
        self.embedding_size = embedding_size
        self.prob_size = prob_size

        self.embedding_container = EmbeddingContainer(embedding_size,
                                                      prob_size,
                                                      container_size)

        self.mode = mode
        if self.mode not in ['online', 'offline']:
            raise ValueError('Evaluator mode: {} is not defined.'.format(
                self.mode))

        self._build()

        self._instance_counter = 0
        self._total_metrics = {}
        self._results = {}
        # Allocate general query interface
        if not self.configs.database[config_fields.database_type]:
            # TODO @kv: consistent check with query condition
            print('No attribute database')
            self.query_interface = None
        else:
            self.query_interface = QueryInterface(self.configs.database)
            print('Attribute database is initialized.')
Example #3
def main(args):

    data_dir = args.data_dir
    out_dir = args.output_dir

    container = EmbeddingContainer()
    container.load(data_dir)

    tsne = TSNE(container,
                n_iter=args.iterations,
                n_jobs=args.n_jobs,
                perplexity=args.perplexity)

    tsne.run()
    tsne.save_fig(out_dir)
Example #4
def main(args):

    data_dir = args.data_dir
    out_dir = args.out_dir
    query_command = args.query_command
    anchor_command = args.anchor_command
    # TODO: sanity check

    container = EmbeddingContainer()
    result = ResultContainer()
    container.load(data_dir)

    command = '{}->{}'.format(query_command, anchor_command)
    query_ids, anchor_ids = container.get_instance_id_by_cross_reference_command(
        command)
    query_embeddings = container.get_embedding_by_instance_ids(query_ids)
    anchor_embeddings = container.get_embedding_by_instance_ids(anchor_ids)

    num_of_anchor = anchor_embeddings.shape[0]
    num_of_query = query_embeddings.shape[0]

    agent = IndexAgent(agent_type='HNSW',
                       instance_ids=anchor_ids,
                       embeddings=anchor_embeddings)

    all_query_ids, all_retrieved_ids, all_retrieved_distances = [], [], []
    with tqdm(total=num_of_query) as pbar:
        for query_id, query_emb in zip(query_ids, query_embeddings):
            retrieved_ids, retrieved_distances = agent.search(
                query_emb, top_k=num_of_anchor)
            retrieved_ids = np.squeeze(retrieved_ids)
            retrieved_distances = np.squeeze(retrieved_distances)

            all_query_ids.extend(np.array(query_id).repeat(num_of_anchor))
            all_retrieved_ids.extend(retrieved_ids)
            all_retrieved_distances.extend(retrieved_distances)

            pbar.update()
    print('Indexing finished, {} retrieved events'.format(
        len(all_retrieved_ids)))
    print('Start exporting results...')
    start_time = time()
    result._event_buffer = pd.DataFrame({
        Fields.query_instance_id: all_query_ids,
        Fields.retrieved_instance_id: all_retrieved_ids,
        Fields.retrieved_distance: all_retrieved_distances,
    })
    result.save(out_dir)
    print('Done, saving results took {} seconds.'.format(time() - start_time))
Example #5
def main(args):
    data_dir = args.data_dir
    out_dir = args.out_dir

    if data_dir is None:
        raise ValueError('data_dir should be assigned.')

    feature_object = FeatureObject()
    feature_object.load(data_dir)

    embeddings = feature_object.embeddings
    embeddings = np.squeeze(embeddings)
    filename_strings = feature_object.filename_strings
    label_ids = feature_object.label_ids
    label_names = feature_object.label_names
    instance_ids = np.arange(embeddings.shape[0])

    # Push all embeddings into container
    embedding_container = EmbeddingContainer(
        embedding_size=embeddings.shape[1],
        prob_size=0,
        container_size=embeddings.shape[0])

    for emb, inst_id, label_id in zip(embeddings, instance_ids, label_ids):
        embedding_container.add(inst_id, label_id, emb)

    manifold = Manifold(embedding_container, label_names)
    centers = manifold.class_center()

    c2c_matrix = manifold.center_to_center_relation()
    c2all_relation = manifold.center_to_all_instance_relation()

    for center_label, center_feature in centers.items():
        manifold.distance_trace(center_label, center_feature, 200)

    manifold.locality_analysis()
Example #6
def create_embedding_container_from_featobj(folder_path, verbose=True):
    """Directly load feature object into embedding container.
      Args:
        folder_path: string, path to the folder of FeatureObject
        verbose: Boolean, show the size of feature object if set True
      Return:
        container: EmbeddingContainer
    """
    feature_importer = FeatureObject()
    feature_importer.load(folder_path)
    embeddings = feature_importer.embeddings
    filenames = feature_importer.filename_strings
    instance_ids = feature_importer.instance_ids

    labels = feature_importer.label_ids
    label_names = feature_importer.label_names
    probabilities = feature_importer.probabilities
    has_label_name = label_names is not None
    # TODO: probabilities are loaded but not yet pushed into the container
    has_prob = probabilities is not None

    # pseudo instance_ids
    pseudo_instance_ids = np.arange(embeddings.shape[0])

    if instance_ids is None or instance_ids.size == 0:
        instance_ids = pseudo_instance_ids

    num_feature, dim_feature = embeddings.shape
    if verbose:
        print('{} features with dim-{} are loaded'.format(num_feature, dim_feature))

    # TODO: error handling when label_ids is empty

    container = EmbeddingContainer(embedding_size=dim_feature,
                                   prob_size=0,
                                   container_size=num_feature)
    if not has_label_name:
        for inst_id, feat, label in zip(instance_ids, embeddings, labels):
            inst_id = int(inst_id)
            container.add(inst_id, label, feat)
    else:
        for inst_id, feat, label, name in zip(instance_ids, embeddings, labels, label_names):
            inst_id = int(inst_id)
            container.add(inst_id, label, feat, label_name=name)

    return container
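
# A minimal usage sketch for the helper above; the folder path below is a
# hypothetical placeholder and assumes a FeatureObject has been saved there.
if __name__ == '__main__':
    demo_container = create_embedding_container_from_featobj('./extracted_features')
    # `counts` is the number of added instances, as used by EvaluatorBuilder.
    print('Loaded {} instances into the container'.format(demo_container.counts))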
Example #7
def main():
    args = parser.parse_args()
    config_path = args.config
    data_type = args.data_type
    data_dir = args.data_dir
    out_dir = args.out_dir
    anchor_database_dir = args.anchor_database

    status = status_fields.not_determined

    # check input is given
    if not data_dir:
        raise ValueError('data_dir must be assigned!')

    if out_dir is not None:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    # argument logic
    if data_dir and anchor_database_dir:
        status = status_fields.evaluate_query_anchor
    elif data_dir and anchor_database_dir is None:
        status = status_fields.evaluate_single_container

    if not config_path:
        # TODO: @kv: Generate the default config.
        raise ValueError('evaluation configuration must be assigned!')
    try:
        with open(config_path, 'r') as fp:
            config_dict = yaml.safe_load(fp)
    except (IOError, yaml.YAMLError):
        # TODO: create default config instead of raising.
        raise IOError('Can not load yaml from {}.'.format(config_path))

    # Prepare data container
    container = None
    for case in switch(status):
        print('{} Executes {}'.format(APP_SIGNATURE, status))
        if case(status_fields.evaluate_single_container):
            container = EmbeddingContainer(name='single_container')
            if data_type in ['embedding_container', 'embedding_db']:
                container.load(data_dir)
            # end of switch case
            break

        if case(status_fields.evaluate_query_anchor):
            """TODO: Use native method: merge()
              1. Merge two containers
              2. Add `query->anchor` command in cross_reference
              3. Change number of database
            """

            container = EmbeddingContainer(name='query')
            anchor_container = EmbeddingContainer(name='anchor')
            # load query
            if data_type in ['embedding_container', 'embedding_db']:
                container.load(data_dir)
            # load anchor
            if data_type in ['embedding_container', 'embedding_db']:
                anchor_container.load(anchor_database_dir)

            container.merge(anchor_container,
                            merge_key='merge_record',
                            label_id_rearrange=True)
            # clear buffer
            anchor_container.clear()

            # Change config. TODO: a bit hacky, revise in the future
            # TODO: this does not seem to work well yet
            _opt = config_fields.evaluation_options
            _rank = 'RankingEvaluation'
            _attr = config_fields.attribute
            _cref = config_fields.cross_reference
            _smp = config_fields.sampling
            _cmd = 'merge_record.query -> merge_record.anchor'
            config_dict[_opt][_rank][_attr][_cref] = list(
                filter(None, config_dict[_opt][_rank][_attr][_cref]))
            if _cmd not in config_dict[_opt][_rank][_attr][_cref]:
                config_dict[_opt][_rank][_attr][_cref].append(_cmd)
            config_dict[_opt][_rank][_smp]['num_of_db_instance_per_class'] = 1000
            # end of switch case
            break

    # Build and run evaluation
    evaluator = EvaluatorBuilder(args.embedding_size,
                                 args.prob_size,
                                 config_dict,
                                 mode='offline')
    print(container)
    evaluator.add_container(container)
    evaluator.evaluate()

    # Show Results
    for eval_name, result_container in evaluator.results.items():
        print(eval_name)
        display_name = display_namemap.get(eval_name, eval_name)
        reporter = ReportWriter(result_container)
        overall_report = reporter.overall_report
        print(overall_report)
        if out_dir:
            path = '/'.join([out_dir, 'result_{}'.format(display_name)])
            result_container.save(path)

    if status == status_fields.evaluate_query_anchor and out_dir:
        path = '/'.join([out_dir, 'merged_container'])
        container.save(path)
Example #8
def main(args):
    data_dir_1 = args.data_dir_1
    data_dir_2 = args.data_dir_2
    keyword = args.keyword
    name_data_1 = args.name_1
    name_data_2 = args.name_2
    output_dir = args.output_dir

    container = EmbeddingContainer(name=name_data_1)
    container.load(data_dir_1)

    db_container = EmbeddingContainer(name=name_data_2)
    db_container.load(data_dir_2)

    container.merge(db_container, merge_key=keyword)
    db_container.clear()

    print(container)
    print(container.DataFrame)

    container.save(output_dir)
Example #9
class EvaluatorBuilder(object):
    """Evaluator Builder & Interface.
    """
    def __init__(self, embedding_size, prob_size, config_dict, mode='online'):
        """Evaluator Builder.

          The object builds evaluation functions according to the given configuration
          and manages shared data (embeddings, labels and attributes) in container objects.

          Args:
            embedding_size: Integer, size of the 1-d embedding vector.
            prob_size: Integer, size of the logits.
            config_dict: Dict, loaded from the YAML config.
            mode: String, `online` or `offline`.

          Building procedure: (TODO @kv: update these steps)
            * parse the config
            * allocate containers
            * create evaluations
            * add datum
            * run evaluate
            * (optional) get update_ops
        TODO:
            - deprecate attribute container
        """
        self.configs = ConfigParser(config_dict)

        # allocate shared embedding containers
        container_size = self.configs.container_size
        self.embedding_size = embedding_size
        self.prob_size = prob_size

        self.embedding_container = EmbeddingContainer(embedding_size,
                                                      prob_size,
                                                      container_size)

        self.mode = mode
        if self.mode not in ['online', 'offline']:
            raise ValueError('Evaluator mode: {} is not defined.'.format(
                self.mode))

        self._build()

        self._instance_counter = 0
        self._total_metrics = {}
        self._results = {}
        # Allocate general query interface
        if not self.configs.database[config_fields.database_type]:
            # TODO @kv: consistent check with query condition
            print('No attribute database')
            self.query_interface = None
        else:
            self.query_interface = QueryInterface(self.configs.database)
            print('Attribute database is initialized.')

    def _build(self):
        """
          Build:
            Parse the config and create evaluators.
        """
        # Allocate evaluation objects with their corresponding configurations
        self.evaluations = {}
        # Iterate over a copy: chosen_evaluation_names may be modified below.
        for eval_name in list(self.configs.chosen_evaluation_names):
            if eval_name == eval_fields.classification and self.prob_size == 0:
                print('{} is assigned, but prob_size == 0, removed from the chosen list.'
                      .format(eval_name))
                # remove the chosen name from the list
                self.configs.chosen_evaluation_names.remove(eval_name)
                continue
            eval_config = self.configs.get_eval_config(eval_name)
            self.evaluations[eval_name] = REGISTERED_EVALUATION_OBJECTS[eval_name](
                eval_config, self.mode)

    @property
    def evaluation_names(self):
        # NOTE: evaluation_types from config; evaluation_names from object instance.
        return self.configs.chosen_evaluation_names

    @property
    def metric_names(self):
        _metric_names = []
        for _eval_name in self.configs.chosen_evaluation_names:
            _display_eval_name = EVALUATION_DISPLAY_NAMES.get(_eval_name, _eval_name)
            _metric_names_per_evaluation = self.evaluations[_eval_name].metric_names
            for _metric_name in _metric_names_per_evaluation:
                _metric_names.append('{}/{}'.format(_display_eval_name, _metric_name))
        return _metric_names

    def add_instance_id_and_embedding(self,
                                      instance_id,
                                      label_id,
                                      embedding,
                                      probability=None):
        """Add embedding and label for a sample to be used for evaluation.

           If the query attribute names are given in config, this function will
           search them on database automatically.

        Args:
            instance_id, integer:
                A integer identifier for the image. instance_id
            label_id: An interger to describe class
            embedding, list or numpy array:
                Embedding, feature vector
        """

        # NOTE: If we call classification, then add probability.
        # TODO @kv: If instance_id is None, use index as default.
        if instance_id is None or instance_id == -1:
            instance_id = self._instance_counter

        if not isinstance(instance_id, int):
            instance_id = int(instance_id)
        if not isinstance(label_id, int):
            label_id = int(label_id)

        if self.query_interface:
            queried_attributes = self.query_interface.query(instance_id)
            self.embedding_container.add(instance_id,
                                         label_id,
                                         embedding,
                                         probability,
                                         attribute=queried_attributes)
        else:
            self.embedding_container.add(instance_id, label_id, embedding,
                                         probability)

        # Verbose logging during development.
        if self.embedding_container.counts % 1000 == 0:
            if probability is None:
                print('{} embeddings are added.'.format(
                    self.embedding_container.counts))
            else:
                print('{} embeddings and probabilities are added.'.format(
                    self.embedding_container.counts))

        self._instance_counter += 1

    def add_container(self, embedding_container=None):
        """Add filled containers which should be provided previously.

          Args:
            embedding_container: EmbeddingContainer, default is None.
          Notice:
            Sanity check:
          TODO @kv: Think about how to cooperate with attributes
        """
        # replace container
        if embedding_container is not None:
            if not isinstance(embedding_container, EmbeddingContainer):
                # TODO: raise TypeError instead of returning silently
                return
            self.embedding_container.clear()
            self.embedding_container = embedding_container
            print('Update embedding container.')

    def evaluate(self):
        """Execute given evaluations and returns a dictionary of metrics.
          Return:
            total_metrics: A flatten dictionary for display each measures
        """
        for _eval_name, _evaluation in self.evaluations.items():
            # Pass the container to the evaluation objects.
            res_container = _evaluation.compute(self.embedding_container)
            self._results[_eval_name] = res_container

            # TODO: flatten results and return
            _display_name = EVALUATION_DISPLAY_NAMES.get(_eval_name, _eval_name)
            if res_container:
                self._total_metrics[_display_name] = res_container.flatten
            else:
                self._total_metrics[_display_name] = {}
        flatten = {}
        for _eval_name, _content in self._total_metrics.items():
            for _metric, _value in _content.items():
                _combined_name = '{}/{}'.format(_eval_name, _metric)
                flatten[_combined_name] = _value
        return flatten

    @property
    def results(self):
        return self._results

    def clear(self):
        """Clears the state to prepare for a fresh evaluation."""
        self.embedding_container.clear()
        for _, _container in self._total_metrics.items():
            _container.clear()
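
# A minimal offline usage sketch of EvaluatorBuilder, following the building
# procedure listed in the __init__ docstring. The yaml path, embedding size
# and the randomly generated data below are hypothetical placeholders.
if __name__ == '__main__':
    import numpy as np
    import yaml

    with open('eval_config.yml', 'r') as fp:  # hypothetical config file
        demo_config = yaml.safe_load(fp)

    evaluator = EvaluatorBuilder(embedding_size=256,
                                 prob_size=0,
                                 config_dict=demo_config,
                                 mode='offline')

    # Add datum: one embedding per instance, labels cycled over 10 classes.
    for idx in range(100):
        evaluator.add_instance_id_and_embedding(
            instance_id=idx,
            label_id=idx % 10,
            embedding=np.random.rand(256).astype(np.float32))

    # Run the chosen evaluations and inspect the flattened metrics.
    for metric_name, value in evaluator.evaluate().items():
        print(metric_name, value)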