Example #1
def check_index_recon(embeds_path,
                      index_or_index_path,
                      embeds_format='labeled_numpy',
                      sort=True,
                      **kwargs):
    # Accept either a serialized index path or an already-loaded index object.
    index = faiss.read_index(index_or_index_path) if isinstance(
        index_or_index_path, str) else index_or_index_path
    # IVF-style indexes need a direct map before reconstruct_n can be used.
    faiss.downcast_index(index).make_direct_map()
    embeds_list, _ = load_embeds(embeds_path=embeds_path,
                                 format=embeds_format,
                                 sort=sort,
                                 **kwargs)

    # tic("Gathering targets ...")
    # all_tgt_embeds = []
    # for file_path in Tqdm.tqdm(embeds_paths):
    #     embeds_group, batch_group = pickle_load(file_path)
    #     for embeds, batch in zip(embeds_group[embeds_key], batch_group):
    #         all_tgt_embeds.append(embeds[embeds_idx])
    #
    # toc("Done!")

    tic("Checking embedding reconstruction difference ...")
    all_embeds = np.concatenate(embeds_list)
    all_embeds_recon = index.reconstruct_n(0, len(all_embeds))
    embeds_diff = np.linalg.norm(all_embeds - all_embeds_recon)
    toc("Passed embedding reconstruction difference check.") \
        if embeds_diff == 0 else toc(f"Embedding reconstruction difference: {embeds_diff}.")
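A minimal usage sketch of `check_index_recon`, assuming the paths below are placeholders and that the embeddings were written in the 'labeled_numpy' layout expected by `load_embeds`:

# Hypothetical paths; the index can also be passed as an already-loaded object.
check_index_recon(embeds_path='out/embeds',
                  index_or_index_path='out/embeds.idx',
                  embeds_format='labeled_numpy',
                  sort=True)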
Example #2
File: mp_ext.py Project: XinliYu/utix
def parallel_process_by_queue(num_p,
                              data_iter,
                              target,
                              args,
                              ctx: BaseContext = None,
                              task_unit_size=5000,
                              print_out=__debug__):
    if isinstance(target, MPTarget):
        target.use_queue = True
    if ctx is None:
        ctx = get_context('spawn')
    iq = Queue(ctx=ctx)
    oq = ctx.Manager().Queue()  # output queue hosted by a Manager process

    tic(f"Creating input queue with task unit size {task_unit_size}",
        verbose=print_out)
    cnt_task_unit = 0
    for item in tqdm(slices__(data_iter, task_unit_size)):
        iq.put(item)
        cnt_task_unit += 1
    jobs = [None] * num_p
    for i in range(num_p):
        jobs[i] = ctx.Process(target=target, args=(i, iq, oq) + args)
    toc()

    tic(f"Working on {cnt_task_unit} task units with {num_p} processes",
        verbose=print_out)
    start_and_wait_jobs(jobs)

    out = []
    while not oq.empty():
        out.append(oq.get_nowait())
    toc()
    return out
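A hedged usage sketch of `parallel_process_by_queue`. `count_tokens` is a hypothetical worker written against the call convention implied by `ctx.Process(target=target, args=(i, iq, oq) + args)`, i.e. `target(pid, in_queue, out_queue, *args)`; with the 'spawn' context it must be defined at module level so it can be pickled, and each queue item is one task unit (a list) produced by `slices__`.

def count_tokens(pid, in_queue, out_queue, sep):
    # Drain task units until the pre-filled input queue is empty, pushing one
    # result per task unit onto the managed output queue.
    while not in_queue.empty():
        try:
            lines = in_queue.get_nowait()
        except Exception:
            break
        out_queue.put(sum(len(line.split(sep)) for line in lines))

if __name__ == '__main__':
    # 'corpus.txt' is a placeholder input file.
    with open('corpus.txt') as f:
        token_counts = parallel_process_by_queue(num_p=4,
                                                 data_iter=f,
                                                 target=count_tokens,
                                                 args=(' ',))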
Example #3
File: mp_ext.py Project: XinliYu/utix
def dispatch_data(num_p: int,
                  data_iter: Union[Iterator, Iterable, List],
                  args: Tuple,
                  print_out=__debug__):
    if num_p <= 0:
        raise ValueError(
            f"The number of processes specified in `num_p` must be positive, but it is {num_p}."
        )

    tic("Splitting task", verbose=print_out)
    splits = split_iter(it=data_iter, num_splits=num_p, use_tqdm=print_out)
    toc(print_out=print_out)

    num_p = len(splits)
    if num_p == 0:
        raise ValueError(
            f"The number of data splits is zero. Possibly no data was read from the provided iterator."
        )
    else:
        job_args = [None] * num_p
        for pidx in range(num_p):
            if print_out:
                hprint_pairs(('pid', pidx), ('workload', len(splits[pidx])))
            job_args[pidx] = (pidx, splits[pidx]) + args
        return job_args
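A short sketch of how the per-process argument tuples returned by `dispatch_data` might be consumed; `worker` and its data/config placeholders are hypothetical, and `get_context`/`start_and_wait_jobs` are the same helpers used in `parallel_process_by_queue` above.

def worker(pid, data_split, shared_cfg):
    # Placeholder worker: each process receives its own slice of the data.
    for record in data_split:
        pass  # process `record` here

if __name__ == '__main__':
    records = range(100000)  # placeholder data
    shared_cfg = {}          # placeholder shared argument
    ctx = get_context('spawn')
    job_args = dispatch_data(num_p=4, data_iter=records, args=(shared_cfg,))
    jobs = [ctx.Process(target=worker, args=a) for a in job_args]
    start_and_wait_jobs(jobs)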
Example #4
def iter_feature_data(csv_file_path,
                      num_meta_data_fields=0,
                      num_label_fields=1,
                      use_tqdm=True,
                      disp_msg=None,
                      verbose=__debug__,
                      fields_as_list=True,
                      parse_labels_as_ints=False,
                      parse_feats_as_floats=False,
                      parse=False,
                      replace_nan=None,
                      num_p=1):
    """

    NOTE this is multi-processing wrap for the actual csv-based feature data reading by the private `_iter_feature_data` method.
    """
    if num_p <= 1:
        return _iter_feature_data(csv_file_path=csv_file_path,
                                  num_meta_data_fields=num_meta_data_fields,
                                  num_label_fields=num_label_fields,
                                  use_tqdm=use_tqdm,
                                  disp_msg=disp_msg,
                                  verbose=verbose,
                                  fields_as_list=fields_as_list,
                                  parse_labels_as_ints=parse_labels_as_ints,
                                  parse_feats_as_floats=parse_feats_as_floats,
                                  parse=parse,
                                  replace_nan=replace_nan)
    else:
        import utix.mpex as mpex
        timex.tic(
            f"Loading L1 feature file at {csv_file_path} with multi-processing"
        )
        rst = mpex.mp_read_from_files(
            num_p=num_p,
            input_path=csv_file_path,
            target=mpex.MPTarget(target=partial(
                _iter_feature_data,
                num_meta_data_fields=num_meta_data_fields,
                num_label_fields=num_label_fields,
                use_tqdm=use_tqdm,
                disp_msg=disp_msg,
                verbose=verbose,
                fields_as_list=fields_as_list,
                parse_labels_as_ints=parse_labels_as_ints,
                parse_feats_as_floats=parse_feats_as_floats,
                parse=parse,
                replace_nan=replace_nan),
                                 pass_pid=False,
                                 pass_each=True,
                                 is_target_iter=True),
            result_merge='chain')
        timex.toc()
        return rst
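A hedged usage sketch; the CSV path is a placeholder, and the field counts assume rows laid out as one metadata field, one label field, then the feature columns:

# Single-process read; passing num_p > 1 would route the same call through
# mpex.mp_read_from_files as shown above.
rows = iter_feature_data('features.csv',
                         num_meta_data_fields=1,
                         num_label_fields=1,
                         parse=True,
                         parse_labels_as_ints=True,
                         parse_feats_as_floats=True)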
Example #5
def load_embeds(embeds_path,
                format='labeled_numpy',
                read_embeds=True,
                read_labels=True,
                use_tqdm: bool = True,
                tqdm_msg: str = None,
                sort=True,
                **kwargs):
    if tqdm_msg is None:
        if read_embeds and read_labels:
            tqdm_msg = f'loading embeds with labels at {embeds_path}'
        elif read_embeds:
            tqdm_msg = f'loading embeds at {embeds_path}'
        elif read_labels:
            tqdm_msg = f'loading labels at {embeds_path}'
        else:
            return
    embeds_it = iter_embeds(embeds_path=embeds_path,
                            format=format,
                            read_embeds=read_embeds,
                            read_labels=read_labels,
                            use_tqdm=use_tqdm,
                            tqdm_msg=tqdm_msg,
                            sort=sort,
                            **kwargs)
    tic('Loading embeddings ...')
    if format == 'labeled_numpy':
        output = list(embeds_it)
        if read_embeds and read_labels:
            embeds_list, labels_list = gx.unzip(output)
            gx.hprint_message(
                f"Total number of embedding batches at {embeds_path} to index",
                len(embeds_list))
            output = (embeds_list, labels_list)
        elif read_embeds or read_labels:
            gx.hprint_message(
                f"Total number of embedding batches at {embeds_path} to index",
                len(output))
    else:
        raise NotImplementedError(
            f"the embedding file format '{format}' is not supported")

    toc(msg=f'Done!')
    return output
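A usage sketch mirroring how `check_index_recon` above (and `build_index` below) call this function; the path is a placeholder:

embeds_list, labels_list = load_embeds(embeds_path='out/embeds',
                                       format='labeled_numpy',
                                       sort=True)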
Example #6
File: mp_read.py Project: XinliYu/utix
from functools import partial

import utix.mpex as mpex
import utix.ioex as ioex
import utix.pathex as paex
import utix.strex as strex
import utix.timex as timex

if __name__ == '__main__':
    src = r'E:\Data\dfsv1f\source_data\main_data\features_2020223\slot_value_features.json'
    trg = r'./tmp1.txt'

    timex.tic('with mp iter')
    it = mpex.mp_read(
        data_iter=[src],
        provider=mpex.MPProvider(
            create_iterator=partial(ioex.iter_all_lines_from_all_files,
                                    use_tqdm=True),
            chunk_size=1000),
        producer=mpex.MPTarget(target=strex.hash_str,
                               pass_each=True,
                               pass_pid=False))
    hashes1 = list(it)
    timex.toc()

    timex.tic('no mp iter')
    hashes2 = [strex.hash_str(x) for x in ioex.iter_all_lines_from_all_files(src)]
    timex.toc()

    # `list.sort()` sorts in place and returns None, so comparing its results
    # would always print True; compare sorted copies instead.
    print(sorted(hashes1) == sorted(hashes2))
Example #7
def build_index(embeds_path,
                output_path,
                num_clusters=65536,
                use_gpu=False,
                train_ratio=1.0,
                embeds_format='labeled_numpy',
                sort=True,
                **kwargs):
    # embeds_file_paths = pathex.get_sorted_files_from_all_sub_dirs__(embeds_path, full_path=True)

    # gx.write_all_lines(path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}_files.txt'), embeds_file_paths)
    # text_file_path = path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}.txt')
    # index_file_path = path.join(output_dir, f'{EMBEDS_INDEX_FILE_PREFIX}_{embeds_key}.idx')

    embeds_list, _ = load_embeds(embeds_path=embeds_path,
                                 format=embeds_format,
                                 sort=sort,
                                 **kwargs)

    tic('Initializing index ...')
    if not num_clusters:
        # Fall back to a simple heuristic when no cluster count is given.
        num_clusters = len(embeds_list) // 100
    # IVF index with `num_clusters` cells, flat (uncompressed) storage and
    # inner-product similarity.
    index = faiss.index_factory(embeds_list[0].shape[-1],
                                f"IVF{num_clusters},Flat",
                                faiss.METRIC_INNER_PRODUCT)
    if use_gpu:
        index = faiss.index_cpu_to_all_gpus(index)

    tic('Concatenating embeddings ...')
    if 0 < train_ratio < 1:
        gx.hprint_message(
            f"will sample subset for training with ratio {train_ratio}...")

    all_embeds = np.concatenate(embeds_list if train_ratio == 1 else list(
        gx.sampled_iter(embeds_list, train_ratio)))
    toc(msg=f'Initialization done!')

    tic(f'Training embeddings of shape {all_embeds.shape} ...')
    index.train(all_embeds)
    if use_gpu:
        index = faiss.index_gpu_to_cpu(index)
    toc(msg='Index training done!')

    tic('Adding embeddings to index ...')
    del all_embeds  # free the concatenated training copy; batches are added below
    embed_index_start = 0

    for embeds in tqdm(embeds_list):
        embed_count = embeds.shape[0]
        index.add_with_ids(
            embeds,
            np.arange(embed_index_start, embed_index_start + embed_count))
        embed_index_start += embed_count

    # with open(text_file_path, 'w+') as wf:
    #     for embeds, batch in embeds_iter(embeds_file_paths=embeds_file_paths, embeds_key=embeds_key, sample_file=sample_file, sample_ratio=train_ratio, embeds_idx=embeds_idx, use_tqdm=True, yield_batch=True):
    #         write_all_lines_to_stream(wf=wf, iterable=batch[embeds_txt_key], use_tqdm=False)
    #         embed_count = embeds.shape[0]
    #         index.add_with_ids(embeds, np.arange(embed_index_start, embed_index_start + embed_count))
    #         embed_index_start += embed_count

    if path.exists(output_path):
        os.remove(output_path)
    gx.hprint_message('saving indexed embeddings to', output_path)
    faiss.write_index(index, output_path)
    toc(msg='Indexing done!')
    return index
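A hedged end-to-end sketch combining `build_index` with the `check_index_recon` helper from the first example; the paths are placeholders, and `num_clusters` is deliberately reduced here on the assumption that the embedding set is modest (FAISS IVF training wants many training vectors per cluster):

index = build_index(embeds_path='out/embeds',
                    output_path='out/embeds.idx',
                    num_clusters=4096,  # placeholder; tune to the corpus size
                    use_gpu=False,
                    train_ratio=0.5)    # train the IVF quantizer on a 50% sample
check_index_recon(embeds_path='out/embeds',
                  index_or_index_path='out/embeds.idx')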
Example #8
File: data_util.py Project: XinliYu/utix
def train_test_val_split_for_files(file_paths: List,
                                   train_test_val_ratios: Tuple[float, float, float],
                                   output_path: Union[str, Tuple[str, str, str]],
                                   copy_files=True,
                                   overwrite=False,
                                   sort=False,
                                   shuffle=True,
                                   rnd_seed=-1,
                                   verbose=__debug__,
                                   num_p=1):
    if verbose:
        tic(f"Splitting {len(file_paths)} files into train/test/val sets with split ratios {train_test_val_ratios}",
            newline=True)
    if len(train_test_val_ratios) != 3:
        raise ValueError(
            f"must specify three ratios for the train/test/validation set splits; got {len(train_test_val_ratios)} ratios '{','.join((str(x) for x in train_test_val_ratios))}'"
        )
    if sort:
        file_paths.sort()
    elif shuffle:
        with numpy_local_seed(rnd_seed) as _:
            if rnd_seed >= 0:
                file_paths.sort()  # NOTE: sort first so the seeded shuffle is reproducible
            np.random.shuffle(file_paths)

    if isinstance(output_path, str):
        train_dir = path.join(output_path, 'train')
        test_dir = path.join(output_path, 'test')
        val_dir = path.join(output_path, 'val')
    elif len(output_path) == 3:
        train_dir, test_dir, val_dir = output_path
    else:
        raise ValueError(
            msg_invalid_arg_value(arg_val=output_path, arg_name='output_path'))

    ensure_sum_to_one_arg(arg_val=train_test_val_ratios,
                          arg_name='train_test_val_ratios',
                          warning=True)
    paex.ensure_dir_existence(train_dir, clear_dir=overwrite, verbose=verbose)
    paex.ensure_dir_existence(test_dir, clear_dir=overwrite, verbose=verbose)
    paex.ensure_dir_existence(val_dir, clear_dir=overwrite, verbose=verbose)
    splits = split_list_by_ratios(list_to_split=file_paths,
                                  split_ratios=train_test_val_ratios,
                                  check_ratio_sum_to_one=False)
    for cur_path_list, cur_output_dir in zip(splits,
                                             (train_dir, test_dir, val_dir)):
        if copy_files:
            batch_copy(
                src_paths=cur_path_list,
                dst_dir=cur_output_dir,
                solve_conflict=True,
                use_tqdm=verbose,
                tqdm_msg=f"copy files to {path.basename(cur_output_dir)}"
                if verbose else None,
                num_p=num_p)
        else:
            batch_move(
                src_paths=cur_path_list,
                dst_dir=cur_output_dir,
                solve_conflict=True,
                undo_move_on_failure=verbose,
                use_tqdm=True,
                tqdm_msg=f"move files to {path.basename(cur_output_dir)}"
                if verbose else None)
    if verbose:
        toc()
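A hedged usage sketch; the directories are placeholders, and `paex.get_files_by_pattern` (used the same way in `get_mp_cache_files` below) is assumed to return the flat list of input files:

files = paex.get_files_by_pattern(dir_or_dirs='data/raw',
                                  pattern='*.csv',
                                  full_path=True,
                                  recursive=False)
train_test_val_split_for_files(file_paths=files,
                               train_test_val_ratios=(0.8, 0.1, 0.1),
                               output_path='data/split',  # creates train/test/val sub-dirs
                               copy_files=True,
                               shuffle=True,
                               rnd_seed=42)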
Example #9
File: mp_ext.py Project: XinliYu/utix
def get_mp_cache_files(num_p,
                       file_paths,
                       sort=True,
                       verbose=__debug__,
                       cache_dir_path=None,
                       chunk_size=100000,
                       sort_use_basename=False,
                       rebuild_on_change=True):
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    else:
        file_paths = paex.sort_paths(file_paths,
                                     sort=sort,
                                     sort_by_basename=sort_use_basename)

    num_file_paths = len(file_paths)
    if verbose:
        hprint_pairs(('number of files', num_file_paths), ('num_p', num_p))
    # Only build chunked cache files when there are fewer input files than
    # processes; otherwise the original files can already be distributed
    # across the processes as they are.
    if num_file_paths < num_p:
        if cache_dir_path is None:
            if len(file_paths) == 1:
                cache_dir_path = paex.add_to_main_name(file_paths[0],
                                                       prefix='.mp.')
            else:
                cache_dir_path = path.join(path.dirname(file_paths[0]), '.mp')
        cache_file_ext_name = paex.get_ext_name(file_paths[0])

        tic('Constructing multi-processing cache files at path ' +
            path.join(cache_dir_path, '*' + cache_file_ext_name))

        mp_cache_file_paths = None
        files_id_path = cache_dir_path + '.id'
        if path.exists(cache_dir_path):
            if path.exists(files_id_path):
                old_files_id = ioex.read_all_text(files_id_path).strip()
                new_files_id = ioex.get_files_id(
                    file_paths
                )  # the file paths are already sorted above, so the files_id would be the same for the same files if they are not changed
                if new_files_id != old_files_id:
                    hprint_message('Files have changed; rebuilding cache at',
                                   cache_dir_path)
                    import shutil, os
                    shutil.rmtree(cache_dir_path)  # removes file cache
                    os.remove(files_id_path)  # removes the id file
                else:
                    mp_cache_file_paths = paex.get_files_by_pattern(
                        dir_or_dirs=cache_dir_path,
                        pattern='*' + cache_file_ext_name,
                        full_path=True,
                        recursive=False,
                        sort=sort,
                        sort_use_basename=sort_use_basename)
                    if not mp_cache_file_paths:
                        wprint_message(
                            'Cache directory exists, but no cache files were found in',
                            cache_dir_path)
            else:
                hprint_message('Files ID file does not exist; rebuilding cache at',
                               cache_dir_path)
                import shutil
                shutil.rmtree(cache_dir_path)  # removes file cache
        if not mp_cache_file_paths:
            ioex.write_all_text(ioex.get_files_id(file_paths), files_id_path)
            ioex.write_all_lines(
                iterable=ioex.iter_all_lines_from_all_files(file_paths),
                output_path=cache_dir_path,
                create_dir=True,
                chunk_size=chunk_size,
                chunked_file_ext_name=cache_file_ext_name)
            mp_cache_file_paths = paex.get_files_by_pattern(
                dir_or_dirs=cache_dir_path,
                pattern='*' + cache_file_ext_name,
                full_path=True,
                recursive=False,
                sort=sort,
                sort_use_basename=sort_use_basename)

        if mp_cache_file_paths:
            hprint_message(title='number of multi-processing cache files',
                           content=len(mp_cache_file_paths))
        else:
            raise IOError('multi-processing cache files are not found')
        file_paths = mp_cache_file_paths
        num_p = min(num_p, len(file_paths))
        toc('Done!')
    return num_p, file_paths
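A short sketch of the intended call pattern: the function returns a possibly reduced `num_p` together with the (cached) file list, which can then be handed to the other multi-processing helpers; the input path is a placeholder.

num_p, cache_files = get_mp_cache_files(num_p=8,
                                        file_paths='data/big_corpus.jsonl',
                                        chunk_size=100000)
# `num_p` is capped at the number of cache files, so the files can be split
# across processes, e.g. via dispatch_data(num_p, cache_files, args=()).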