Пример #1
0
def main(args):
    """Merge two saved embedding containers and save the merged result.

    Args:
      args: Parsed arguments providing `data_dir_1`, `data_dir_2`,
        `keyword`, `name_1`, `name_2` and `output_dir`.
    """
    first_dir, second_dir = args.data_dir_1, args.data_dir_2
    merge_key = args.keyword
    first_name, second_name = args.name_1, args.name_2
    save_dir = args.output_dir

    # Load the container that receives the merge.
    merged = EmbeddingContainer(name=first_name)
    merged.load(first_dir)

    # Load the container that is merged in, then release it.
    other = EmbeddingContainer(name=second_name)
    other.load(second_dir)
    merged.merge(other, merge_key=merge_key)
    other.clear()

    for printable in (merged, merged.DataFrame):
        print(printable)

    merged.save(save_dir)
Пример #2
0
def create_embedding_container_from_featobj(folder_path, verbose=True):
    """Directly load feature object into embedding container.

      Only embeddings, instance ids, label ids and (when present) label names
      are copied; the container is created with `prob_size=0`, so any
      probabilities stored in the feature object are not transferred.

      Args:
        folder_path: string, path to the folder of FeatureObject
        verbose: Boolean, print the number and dimension of the loaded
          features if set True
      Return:
        container: EmbeddingContainer
    """
    feature_importer = FeatureObject()
    feature_importer.load(folder_path)
    embeddings = feature_importer.embeddings
    instance_ids = feature_importer.instance_ids
    labels = feature_importer.label_ids
    label_names = feature_importer.label_names
    # TODO: probabilities of the feature object are not imported yet.

    num_feature, dim_feature = embeddings.shape

    # Fall back to positional pseudo ids when no instance ids were stored.
    if instance_ids is None or instance_ids.size == 0:
        instance_ids = np.arange(num_feature)

    if verbose:
        print('{} features with dim-{} are loaded'.format(
            num_feature, dim_feature))

    # TODO: error handling for an empty label_ids array.

    container = EmbeddingContainer(embedding_size=dim_feature,
                                   prob_size=0,
                                   container_size=num_feature)
    if label_names is None:
        for inst_id, feat, label in zip(instance_ids, embeddings, labels):
            container.add(int(inst_id), label, feat)
    else:
        for inst_id, feat, label, name in zip(instance_ids, embeddings, labels,
                                              label_names):
            container.add(int(inst_id), label, feat, label_name=name)

    return container
Пример #3
0
def main(args):
    """Load an embedding container and print auto-labelled subgroups.

    Args:
      args: Parsed arguments providing `data_dir` and `out_dir`.

    Raises:
      ValueError: if `args.data_dir` is None.
    """
    source_dir, out_dir = args.data_dir, args.out_dir

    if source_dir is None:
        raise ValueError('data_dir or database should be assigned.')

    container = EmbeddingContainer()
    container.load(source_dir)

    # Sub-grouping is run for label id 1 only.
    grouping = HierarchicalGrouping(container)
    subgroup_frame = grouping.auto_label_subgroup(label_id=1)
    print(subgroup_frame)
Пример #4
0
def main(args):
    """Run t-SNE on a saved embedding container and save the figure.

    Args:
      args: Parsed arguments providing `data_dir`, `output_dir`,
        `iterations`, `n_jobs` and `perplexity`.
    """
    source_dir = args.data_dir
    figure_dir = args.output_dir

    container = EmbeddingContainer()
    container.load(source_dir)

    tsne_options = {
        'n_iter': args.iterations,
        'n_jobs': args.n_jobs,
        'perplexity': args.perplexity,
    }
    projector = TSNE(container, **tsne_options)

    projector.run()
    projector.save_fig(figure_dir)
Пример #5
0
def main(args):
    """Retrieve all anchors for every query and save the retrieval events.

    The cross reference command `<query_command>-><anchor_command>` selects
    the query and anchor instances from the loaded container. Each query is
    searched against an HNSW index of the anchors with `top_k` equal to the
    number of anchors, and every (query, retrieved, distance) triple is
    written to a ResultContainer saved under `out_dir`.

    Args:
      args: Parsed arguments providing `data_dir`, `out_dir`,
        `query_command` and `anchor_command`.
    """
    data_dir = args.data_dir
    out_dir = args.out_dir
    query_command = args.query_command
    anchor_command = args.anchor_command
    # TODO: sanity check

    container = EmbeddingContainer()
    result = ResultContainer()
    container.load(data_dir)

    command = '{}->{}'.format(query_command, anchor_command)
    query_ids, anchor_ids = container.get_instance_id_by_cross_reference_command(command)
    query_embeddings = container.get_embedding_by_instance_ids(query_ids)
    anchor_embeddings = container.get_embedding_by_instance_ids(anchor_ids)

    num_of_anchor = anchor_embeddings.shape[0]
    num_of_query = query_embeddings.shape[0]

    agent = IndexAgent(agent_type='HNSW',
                       instance_ids=anchor_ids,
                       embeddings=anchor_embeddings)

    all_query_ids, all_retrieved_ids, all_retrieved_distances = [], [], []
    with tqdm(total=num_of_query) as pbar:
        for query_id, query_emb in zip(query_ids, query_embeddings):
            retrieved_ids, retrieved_distances = agent.search(query_emb, top_k=num_of_anchor)
            # Flatten with ravel rather than squeeze: squeeze turns a single
            # hit into a 0-d array, which list.extend cannot iterate.
            retrieved_ids = np.ravel(retrieved_ids)
            retrieved_distances = np.ravel(retrieved_distances)

            # Repeat the query id once per returned hit so the three columns
            # always have the same length.
            all_query_ids.extend(np.array(query_id).repeat(retrieved_ids.size))
            all_retrieved_ids.extend(retrieved_ids)
            all_retrieved_distances.extend(retrieved_distances)

            pbar.update()
    print('Indexing finished, {} retrieved events'.format(len(all_retrieved_ids)))
    print('Start exporting results...')
    start_time = time()
    # NOTE(review): writes the private event buffer directly, presumably to
    # avoid per-event insertion -- confirm against ResultContainer.
    result._event_buffer = pd.DataFrame(
        {
            Fields.query_instance_id: all_query_ids,
            Fields.retrieved_instance_id: all_retrieved_ids,
            Fields.retrieved_distance: all_retrieved_distances,
        }
    )
    result.save(out_dir)
    print('Done, saving results take {} seconds.'.format(time() - start_time))
Пример #6
0
    def __init__(self, embedding_size, prob_size, config_dict, mode='online'):
        """Evaluator Builder.

          Parses the given configuration, allocates the shared embedding
          container, builds the evaluations and sets up the query interface
          when the configuration names a database type.

          Args:
            embedding_size: Integer, size of a 1d embedding.
            prob_size: Integer, size of the logits.
            config_dict: Dict, loaded from a yaml format config.
            mode: String, `online` or `offline`.

          Raises:
            ValueError: if `mode` is neither `online` nor `offline`.

          Building procedure: (TODO @kv: update these steps)
            * parse the config
            * allocate containers
            * create evaluations
            * add datum
            * run evaluate
            * (optional) get update_ops
        TODO:
            - deprecate attribute container
        """
        self.configs = ConfigParser(config_dict)

        # Shared embedding container, sized by the parsed configuration.
        capacity = self.configs.container_size
        self.embedding_size = embedding_size
        self.prob_size = prob_size

        self.embedding_container = EmbeddingContainer(
            embedding_size, prob_size, capacity)

        self.mode = mode
        if mode not in ('online', 'offline'):
            raise ValueError('Evaluator mode: {} is not defined.'.format(mode))

        self._build()

        self._instance_counter = 0
        self._total_metrics = dict()
        self._results = dict()
        # General query interface, only when a database type is configured.
        if self.configs.database[config_fields.database_type]:
            self.query_interface = QueryInterface(self.configs.database)
            print('Attribute database is initialized.')
        else:
            # TODO @kv: consistent check with query condition
            print('No attribute database')
            self.query_interface = None
Пример #7
0
def main(args):
    """Index all embeddings and run one similarity search for the first label.

    Does nothing when `args.data_dir` is None.

    Args:
      args: Parsed arguments providing `data_dir`.
    """
    source_dir = args.data_dir
    if source_dir is None:
        return

    container = EmbeddingContainer()
    container.load(args.data_dir)

    # The attribute read is kept from the original flow; its value is
    # replaced right away by the lookup ordered by instance id.
    indexed_embeddings = container.embeddings
    all_ids = container.instance_ids
    indexed_embeddings = container.get_embedding_by_instance_ids(all_ids)
    agent = IndexAgent('HNSW', all_ids, indexed_embeddings, distance_measure='ip')

    print(container)

    for label_id in container.label_ids:
        member_ids = container.get_instance_ids_by_label_ids(label_id)
        member_embeddings = container.get_embedding_by_instance_ids(member_ids)
        # Ask for twice as many neighbours as the class has members.
        neighbour_count = 2 * len(member_ids)
        agent.search(member_embeddings,
                     top_k=neighbour_count,
                     is_similarity=True)
        # Only the first label id is processed.
        break
Пример #8
0
def main(args):
    """Run manifold analysis on the embeddings of a saved feature object.

    Loads the feature object from `args.data_dir`, copies its embeddings and
    label ids into an EmbeddingContainer using positional instance ids, then
    runs the Manifold class-center, relation, distance-trace and locality
    analyses.

    Args:
      args: Parsed arguments providing `data_dir`.

    Raises:
      ValueError: if `args.data_dir` is None.
    """
    data_dir = args.data_dir

    if data_dir is None:
        raise ValueError('data_dir or database should be assigned.')

    feature_object = FeatureObject()
    feature_object.load(data_dir)

    embeddings = np.asarray(feature_object.embeddings)
    if embeddings.ndim > 2:
        # Drop singleton axes but never the leading (instance) axis; an
        # unrestricted squeeze would also remove it for a single embedding
        # and make `embeddings.shape[1]` below fail.
        singleton_axes = tuple(axis for axis in range(1, embeddings.ndim)
                               if embeddings.shape[axis] == 1)
        embeddings = np.squeeze(embeddings, axis=singleton_axes)
    label_ids = feature_object.label_ids
    label_names = feature_object.label_names
    instance_ids = np.arange(embeddings.shape[0])

    # Push all embeddings into container
    embedding_container = EmbeddingContainer(
        embedding_size=embeddings.shape[1],
        prob_size=0,
        container_size=embeddings.shape[0])

    for emb, inst_id, label_id in zip(embeddings, instance_ids, label_ids):
        embedding_container.add(inst_id, label_id, emb)

    manifold = Manifold(embedding_container, label_names)
    centers = manifold.class_center()

    # Return values are unused here; the calls are kept because they may
    # report or cache results inside Manifold -- TODO confirm.
    manifold.center_to_center_relation()
    manifold.center_to_all_instance_relation()

    for center_label, center_feature in centers.items():
        manifold.distance_trace(center_label, center_feature, 200)

    manifold.locality_analysis()
Пример #9
0
def main():
    """Run offline evaluation on a saved container, optionally against anchors.

    With only `data_dir` given, a single container is evaluated. When
    `anchor_database` is also given, the query container is merged with the
    anchor container, a `merge_record.query -> merge_record.anchor` cross
    reference command is added to the ranking evaluation config, and the
    merged container is saved next to the results.

    Raises:
      ValueError: if `data_dir` or `config` is not given.
      IOError: if the configuration file cannot be read or parsed.
    """
    args = parser.parse_args()
    config_path = args.config
    data_type = args.data_type
    data_dir = args.data_dir
    out_dir = args.out_dir
    anchor_database_dir = args.anchor_database

    # check input is given
    if not data_dir:
        raise ValueError('data_dir must be assigned!')

    # An empty out_dir means "do not save", same as the checks further down.
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # argument logic: any falsy anchor (None or '') means single container,
    # so the status is always determined once data_dir has been validated.
    if anchor_database_dir:
        status = status_fields.evaluate_query_anchor
    else:
        status = status_fields.evaluate_single_container

    if not config_path:
        # TODO: @kv: Generate the default config.
        raise ValueError('evaluation configuration must be assigned!')
    try:
        with open(config_path, 'r') as fp:
            # safe_load replaces yaml.load(fp): calling load without a Loader
            # can construct arbitrary objects and is rejected by newer
            # PyYAML releases.
            config_dict = yaml.safe_load(fp)
    except (IOError, OSError, yaml.YAMLError):
        # TODO: create default config instead of error.
        raise IOError('Can not load yaml from {}.'.format(config_path))

    # Prepare data container
    container = None
    for case in switch(status):
        print('{} Executes {}'.format(APP_SIGNATURE, status))
        if case(status_fields.evaluate_single_container):
            container = EmbeddingContainer(name='single_container')
            if data_type in ['embedding_container', 'embedding_db']:
                container.load(data_dir)
            # end of switch case
            break

        if case(status_fields.evaluate_query_anchor):
            # TODO: Use native method: merge()
            #   1. Merge two containers
            #   2. Add `query->anchor` command in cross_reference
            #   3. Change number of database

            container = EmbeddingContainer(name='query')
            anchor_container = EmbeddingContainer(name='anchor')
            # load query and anchor
            if data_type in ['embedding_container', 'embedding_db']:
                container.load(data_dir)
                anchor_container.load(anchor_database_dir)

            container.merge(anchor_container,
                            merge_key='merge_record',
                            label_id_rearrange=True)
            # clear buffer
            anchor_container.clear()

            # Change config TODO: A little bit hacky, modify in future
            # TODO: It seems not work well
            _opt = config_fields.evaluation_options
            _rank = 'RankingEvaluation'
            _attr = config_fields.attribute
            _cref = config_fields.cross_reference
            _smp = config_fields.sampling
            _cmd = 'merge_record.query -> merge_record.anchor'
            rank_config = config_dict[_opt][_rank]
            # Drop empty entries before adding the query->anchor command.
            rank_config[_attr][_cref] = list(
                filter(None, rank_config[_attr][_cref]))
            if _cmd not in rank_config[_attr][_cref]:
                rank_config[_attr][_cref].append(_cmd)
            rank_config[_smp]['num_of_db_instance_per_class'] = 1000
            # end of switch case
            break

    # Build and run evaluation
    evaluator = EvaluatorBuilder(args.embedding_size,
                                 args.prob_size,
                                 config_dict,
                                 mode='offline')
    print(container)
    evaluator.add_container(container)
    evaluator.evaluate()

    # Show Results
    for eval_name, result_container in evaluator.results.items():
        print(eval_name)
        if eval_name in display_namemap:
            display_name = display_namemap[eval_name]
        else:
            display_name = eval_name
        reporter = ReportWriter(result_container)
        overall_report = reporter.overall_report
        print(overall_report)
        if out_dir:
            path = os.path.join(out_dir, 'result_{}'.format(display_name))
            result_container.save(path)

    if status == status_fields.evaluate_query_anchor and out_dir:
        path = os.path.join(out_dir, 'merged_container')
        container.save(path)