Example #1
    def _set_mips_index(self):
        """
        Create a Faiss Flat index with inner product as the metric
        to search against
        """
        try:
            import faiss
        except ImportError:
            raise Exception(
                "Error: Please install faiss to use FaissMIPSIndex")

        if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
            print("\n> Building index", flush=True)

        cpu_index = faiss.IndexFlatIP(self.embed_size)

        if self.use_gpu:
            # create resources and config for GpuIndex
            config = faiss.GpuMultipleClonerOptions()
            config.shard = True
            config.useFloat16 = True
            gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=config)
            self.mips_index = faiss.IndexIDMap(gpu_index)
            if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
                print(">> Initialized index on GPU", flush=True)
        else:
            # CPU index supports IDs so wrap with IDMap
            self.mips_index = faiss.IndexIDMap(cpu_index)
            if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
                print(">> Initialized index on CPU", flush=True)

        # if we were constructed with a BlockData, then automatically load it
        # when the FAISS structure is built
        if self.embed_data is not None:
            self.add_embed_data(self.embed_data)
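For orientation, here is a minimal standalone sketch of how an IndexIDMap-wrapped inner-product index like the one above is typically queried once built; the array shapes and names are illustrative assumptions, not part of the class:

import numpy as np
import faiss

embed_size = 128                                    # assumed embedding width
index = faiss.IndexIDMap(faiss.IndexFlatIP(embed_size))

vectors = np.random.rand(1000, embed_size).astype('float32')
ids = np.arange(1000, dtype='int64')                # faiss expects int64 ids
index.add_with_ids(vectors, ids)

queries = np.random.rand(4, embed_size).astype('float32')
scores, result_ids = index.search(queries, 10)      # top-10 matches by inner product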
Example #2
    def build_index(self):
        if self.text_embedding_path.endswith(".gz"):
            f = gzip.open(self.text_embedding_path, mode='rt')
        else:
            f = open(self.text_embedding_path)

        ids = []
        vectors = []
        for line in f:
            vals = line.split('\t')
            if vals[0].startswith('Q'):
                qnode = vals[0]
                # use the number part of Qnodes as id
                id = int(qnode[1:])
                if vals[1] == 'embedding_sentence':
                    self.qnode_to_sentence_dict[qnode] = vals[2]
                if vals[1] == 'text_embedding':
                    x = vals[2].strip().split(',')
                    x = [np.float32(r) for r in x]
                    self.qnode_to_vector_dict[qnode] = np.array([x])
                    ids.append(id)
                    vectors.append(x)
                    # create the wrapped index once, sized to the first embedding seen
                    if self.index is None:
                        self.index = faiss.IndexIDMap(faiss.IndexFlatL2(len(x)))
        f.close()

        self.index.add_with_ids(np.array(vectors), np.array(ids))
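A hedged sketch of how the resulting index might be queried; `builder` stands for a hypothetical instance of the class above, and the qnode and k value are illustrative assumptions:

query_qnode = 'Q42'                                        # hypothetical Wikidata qnode
query_vector = builder.qnode_to_vector_dict[query_qnode]   # stored above as a (1, dim) float32 array
distances, neighbor_ids = builder.index.search(query_vector, 5)
neighbors = ['Q' + str(i) for i in neighbor_ids[0]]        # ids are the numeric part of the qnodes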
Example #3
    def save_faiss_index(self):
        try:
            diary_cover_pic_face_vec_fd = open(
                "./diary_cover_pic_face_vec.txt", "r")

            xb, ids = [], []
            for line in diary_cover_pic_face_vec_fd.readlines():
                line_term_list = line.split("\t")
                diary_id = line_term_list[0]
                face_feature = json.loads(line_term_list[1])
                face_feature_vec = np.array(face_feature)
                xb.append(face_feature_vec)
                ids.append(diary_id)

            xb_np = np.array(xb).astype('float32')
            ids_np = np.array(ids).astype('int64')  # faiss requires 64-bit integer ids
            index = faiss.IndexHNSWFlat(128, 32)
            index = faiss.IndexIDMap(index)
            index.add_with_ids(xb_np, ids_np)
            faiss.write_index(index, settings.INDEX_PATH)

            diary_cover_pic_face_vec_fd.close()
        except Exception:
            logging.error("catch exception, err_msg:%s" %
                          traceback.format_exc())
Example #4
def load_from_db(index_file, version_id):
    global feature_api
    log.info('load_from_db')
    VECTOR_SIZE = 2048

    if index_file is None:
        log.debug('Create a new index file')
        index = faiss.IndexFlatL2(VECTOR_SIZE)
        index2 = faiss.IndexIDMap(index)
    else:
        log.debug('Load from index file')
        index2 = faiss.read_index(index_file)

    offset = 0
    limit = 100
    id_num = 1

    file = os.path.join(os.getcwd(), INDEX_FILE)

    i = 0
    try:
        while True:
            queue_size = rconn.llen(REDIS_OBJECT_INDEX_QUEUE)
            if queue_size != 0:
                time.sleep(60)
                continue

            res = feature_api.get_features(offset=offset, limit=limit)

            if len(res) == 0:
                save_index_file(file)
                time.sleep(60 * 60)
                continue

            objects = []
            for obj in res:
                # np.fromstring is deprecated for binary input; np.frombuffer reads the raw bytes
                feature = np.frombuffer(obj['vector'], dtype=np.float32)
                xb = np.expand_dims(np.array(feature, dtype=np.float32),
                                    axis=0)
                id_set = np.array([id_num], dtype='int64')
                index2.add_with_ids(xb, id_set)

                new_obj = {}
                new_obj['object_id'] = obj['object_id']
                new_obj['index'] = id_num
                objects.append(new_obj)
                id_num = id_num + 1

            save_objects_to_db(objects)
            faiss.write_index(index2, file)
            if i % 100 == 0:
                save_index_file(file)

            offset = offset + limit
            i = i + 1

    except Exception as e:
        log.error(str(e))
Example #5
    def __init__(self, epsilon_b: float, epsilon_n: float, lam: int, beta: float,
                 alpha: float, max_age: int, r0: float,
                 dimensions: int = 2, random_state: int = 42) -> None:

        self.graph = Graph()

        self.epsilon_b = epsilon_b
        self.epsilon_n = epsilon_n
        self.lam = lam
        self.beta = beta
        self.alpha = alpha
        self.max_age = max_age
        self.dimensions = dimensions
        self.r0 = r0

        self.index = faiss.IndexIDMap(faiss.IndexFlatL2(dimensions))

        self.next_id = 2
        self.point_to_cluster = {}

        self.cycle = 0
        self.step = 1

        np.random.seed(random_state)

        node_1 = Node(np.random.rand(1, dimensions).astype(
            'float32')[0], 0, id=0, error_cycle=0, radius=r0)
        node_2 = Node(np.random.rand(1, dimensions).astype(
            'float32')[0], 0, id=1, error_cycle=0, radius=r0)

        self.graph.insert_node(node_1)
        self.graph.insert_node(node_2)

        self.index.add_with_ids(
            np.array([node_1.protype, node_2.protype]), np.array([0, 1]))
Example #6
 def buildindex(self):
     try:
         if self.em is None:
             print("No imported encoded text database.")
             dec = input(
                 "Would you like to encode? (it may take ~ 1 hour)\n (y/n): "
             )
             if dec.lower()[0] == 'y':
                 self.encoder.max_seq_length = 512
                 self.em = self.encoder.encode(
                     self.df[self.target].to_list(), show_progress_bar=True)
                 self.em = np.array([emi
                                     for emi in self.em]).astype("float32")
                 self.vecdim = self.em.shape[1]
             else:
                 path = input("Enter the path to encoded text base: ")
                 self.importencoded(path)
         #self.index = faiss.IndexFlatL2(self.vecdim)
         self.index = faiss.IndexFlatIP(self.vecdim)
         self.index = faiss.IndexIDMap(self.index)
         self.normalizeencoded()
         self.index.add_with_ids(self.em, self.df.id.values)
         print("FAISS index was built successfully")
         print("Number of articles:", self.index.ntotal)
     except Exception:
         print("ERROR: CANNOT build index")
Example #7
    def __init__(self,
                 target,
                 nprobe=128,
                 num_gpu=None,
                 index_factory_str=None,
                 verbose=False,
                 mode='proxy',
                 using_gpu=True):
        self._res_list = []

        found_gpu = len(os.environ['CUDA_VISIBLE_DEVICES'].split(","))
        if found_gpu == 0:
            raise RuntimeError(
                "No GPU found. Please export CUDA_VISIBLE_DEVICES")
        if num_gpu is None or num_gpu > found_gpu:
            num_gpu = found_gpu
        print('[faiss gpu] #GPU: {}'.format(num_gpu))

        size, dim = target.shape
        assert size > 0, "size: {}".format(size)
        index_factory_str = "IVF{},PQ{}".format(
            min(8192, 16 * round(np.sqrt(size))),
            32) if index_factory_str is None else index_factory_str
        cpu_index = faiss.index_factory(dim, index_factory_str)
        cpu_index.nprobe = nprobe

        if mode == 'proxy':
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            co.usePrecomputed = False

            index = faiss.IndexProxy()
            for i in range(num_gpu):
                res = faiss.StandardGpuResources()
                self._res_list.append(res)
                sub_index = faiss.index_cpu_to_gpu(
                    res, i, cpu_index, co) if using_gpu else cpu_index
                index.addIndex(sub_index)
        elif mode == 'shard':
            raise NotImplementedError
        else:
            raise KeyError("Unknown index mode")

        index = faiss.IndexIDMap(index)
        index.verbose = verbose

        # get nlist to decide how many samples used for training
        nlist = int([
            item for item in index_factory_str.split(",") if 'IVF' in item
        ][0].replace("IVF", ""))

        # training
        if not index.is_trained:
            indexes_sample_for_train = np.random.randint(0, size, nlist * 256)
            index.train(target[indexes_sample_for_train])

        # add with ids
        target_ids = np.arange(0, size)
        index.add_with_ids(target, target_ids)
        self.index = index
Example #8
def ivf_searsh_2():
    '''
    Custom index
    :return:
    '''
    nlist = 4  # number of IVF partitions (Voronoi cells); more cells make training/partitioning slower
    index = faiss.IndexFlatL2(d)
    iv_index = faiss.IndexIVFFlat(index, d, nlist, faiss.METRIC_L2)

    indexIdMap = faiss.IndexIDMap(iv_index)
    idx = [int(str(int(time.time() * 1000)) + str(i)) for i in range(nb)]
    # print(xb)
    # print(xb.size)
    # print(np.array(idx))
    if not indexIdMap.is_trained:
        indexIdMap.train(xb)
    indexIdMap.add_with_ids(xb, np.array(idx))  # ids must be numeric (64-bit integers)

    # nprobe must be set on the underlying IVF index; assigning it on the IDMap wrapper has no effect
    iv_index.nprobe = 3  # cells visited per query; more cells are more accurate but slower
                         # (here the inverted file has 4 cells and only 3 are scanned)

    start = time.time()
    D, I = indexIdMap.search(xq, K)
    print('ivf_search_time', time.time() - start)

    print('ivf_search nearest ids', I)
    print('ivf_search distances', D)
Example #9
    def _build_faiss_model(self):
        sample = next(self._descriptor_set.iterdescriptors())
        sample_v = sample.vector()
        n, d = self.count(), sample_v.size

        data = np.empty((n, d), dtype=np.float32)
        elements_to_matrix(
            self._descriptor_set,
            mat=data,
            use_multiprocessing=self.use_multiprocessing,
            report_interval=1.0,
        )
        self._uuids = np.array(list(self._descriptor_set.keys()))
        self.faiss_flat = faiss.IndexFlatL2(d)

        if self.exhaustive:
            self._faiss_index = faiss.IndexIDMap(self.faiss_flat)
        else:
            nlist = 10000
            self._faiss_index = faiss.IndexIVFFlat(self.faiss_flat, d, nlist,
                                                   faiss.METRIC_L2)
            self._faiss_index.train(data)
            self._faiss_index.nprobe = 5000

        self._log.info("data shape, type: %s, %s", data.shape, data.dtype)
        self._log.info("uuid shape, type: %s, %s", self._uuids.shape,
                       self._uuids.dtype)
        self._faiss_index.add_with_ids(data, self._uuids)

        self._log.info("FAISS index has been constructed with %d vectors",
                       self._faiss_index.ntotal)
Example #10
 def init_sentence_index(self):
     self.logger.info("Initializing sentence index")
     empty_embedding = np.array([self.embedder.encode("Vole")]).astype("float32")
     self.sentences_list = []
     print(empty_embedding.shape[1])
     self.sentence_index = faiss.IndexFlatL2(empty_embedding.shape[1])
     self.sentence_index = faiss.IndexIDMap(self.sentence_index)
Example #11
def faiss_flat_ip(encoded_data):
    """Faiss flatip."""
    dim = encoded_data.shape[1]
    index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))
    faiss.normalize_L2(encoded_data)
    index.add_with_ids(encoded_data, np.arange(len(encoded_data)))
    return index
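Because the vectors are L2-normalized before being added, the inner-product scores returned by this index are cosine similarities. A small usage sketch with assumed data (faiss.normalize_L2 modifies its argument in place, and the query must be normalized the same way):

import numpy as np
import faiss

encoded_data = np.random.rand(500, 384).astype('float32')   # assumed corpus embeddings
index = faiss_flat_ip(encoded_data)

query = np.random.rand(1, 384).astype('float32')
faiss.normalize_L2(query)                                    # normalize the query as well
scores, ids = index.search(query, 5)                         # cosine similarities and row ids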
Example #12
def load_from_queue(index_file):
    log.info('load_from_queue')
    VECTOR_SIZE = 2048

    if index_file is None:
        log.debug('Create a new index file')
        index = faiss.IndexFlatL2(VECTOR_SIZE)
        index2 = faiss.IndexIDMap(index)
    else:
        log.debug('Load from index file')
        index2 = faiss.read_index(index_file)

    def items():
        while True:
            yield rconn.blpop([REDIS_OBJECT_FEATURE_QUEUE])

    def request_stop(signum, frame):
        log.info('stopping')
        rconn.connection_pool.disconnect()
        log.info('connection closed')
        sys.exit()

    signal.signal(signal.SIGINT, request_stop)
    signal.signal(signal.SIGTERM, request_stop)

    i = 0
    for item in items():
        key, obj_data = item
        obj = pickle.loads(obj_data)
        # log.debug(obj)

        feature = obj['feature']
        xb = np.expand_dims(np.array(feature, dtype=np.float32), axis=0)
        obj['feature'] = None
        rconn.rpush(REDIS_OBJECT_LIST, obj['name'])
        d = pickle.dumps(obj)
        rconn.hset(REDIS_OBJECT_HASH, obj['name'], obj['product_id'])

        # xb = np.array(features)
        id_num = rconn.llen(REDIS_OBJECT_LIST)
        # log.debug(id_num)
        id_array = []
        id_array.append(id_num)
        id_set = np.array(id_array)
        # print(xb)
        # print(np.shape(xb))
        # print(id_set)
        # print(xb.shape)
        # print(id_set.shape)
        # print(id_set)
        start_time = time.time()
        index2.add_with_ids(xb, id_set)
        elapsed_time = time.time() - start_time
        log.info('indexing time: ' + str(elapsed_time))
        file = os.path.join(os.getcwd(), INDEX_FILE)
        if i % 50 == 0:
            faiss.write_index(index2, file)
            save_index_file(file)
        i = i + 1
        log.info('index done')
Example #13
 def test_IDMap(self):
     sub_index = faiss.IndexFlatL2(d)
     index = faiss.IndexIDMap(sub_index)
     index.add_with_ids(xb, np.arange(len(xb)))
     del sub_index
     gc.collect()
     index.add_with_ids(xb, np.arange(len(xb)))
Example #14
def get_persona_faiss_selected(args, tokenizer):
    """ Prepare the dataset for training and evaluation """
    personachat = get_dataset_with_no_tokenizer(tokenizer, args.dataset_path,
                                                args.dataset_cache)

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    persona_faiss_selected = []
    history_faiss_selected = []
    persona_faiss_index = []
    history_faiss_index = []
    persona_complete = parse_data('./Dataset/train_self_original.txt')
    persona_complete = persona_complete[:20]
    # The persona index depends only on persona_complete, so build the sentence
    # model, encode the personas, and construct the index once up front instead
    # of repeating this work for every dialog.
    #model1 = SentenceTransformer('bert-large-nli-mean-tokens')
    #model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased')
    model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    embeddings_persona = model.encode(persona_complete,
                                      show_progress_bar=True)
    #data_train list of set of list of all personalities (not duplicated)
    # Step 1: Change data type
    #embeddings_persona = np.array([embedding for embedding in embeddings_persona]).astype("float32")

    # Step 2: Instantiate the index
    index = faiss.IndexFlatL2(embeddings_persona.shape[1])

    # Step 3: Pass the index to IndexIDMap
    index = faiss.IndexIDMap(index)

    # Step 4: Add vectors and their IDs
    index.add_with_ids(
        embeddings_persona,
        np.array(list(range(0, embeddings_persona.shape[0]))))

    for dataset_name, dataset in personachat.items():
        num_candidates = len(dataset[0]["utterances"][0]["candidates"])
        if args.num_candidates > 0 and dataset_name == 'train':
            num_candidates = min(args.num_candidates, num_candidates)
        for dialog in dataset:
            #persona = dialog["personality"].copy()
            persona = dialog["persona_info"]
            #persona2 = dialog["persona_info2"].copy()
            #persona_selected = faiss(replyanddialog)
            #index: all persona1 sentences or all personalities

            for _ in range(args.personality_permutations):
                for utterance in dialog["utterances"]:
                    history = utterance["history"][-(2 * args.max_history +
                                                     1):]
                    for j, candidate in enumerate(
                            utterance["candidates"][-num_candidates:]):
                        history_encoded = model.encode(history,
                                                       show_progress_bar=True)
                        D, I = index.search(np.array(history_encoded), k=5)
                        history_faiss_selected.append(history)
                        persona_faiss_selected.append(
                            persona_complete[I[0][1]])

                #persona = [persona[-1]] + persona[:-1]  # permuted personalities
    return persona_faiss_selected
Example #15
 def fit(self, item_matrix, ids):
     num, vec_dim = item_matrix.shape
     # create the index (Euclidean / L2 distance)
     self.faiss_index = faiss.IndexFlatL2(vec_dim)
     # wrap it so external ids can be attached
     self.faiss_index1 = faiss.IndexIDMap(self.faiss_index)
     # add the data together with its ids
     self.faiss_index1.add_with_ids(item_matrix, ids)
Example #16
    def __init__(self, batch_size, name, data_location, read_from_file, path=FAISS_PATH, model='document_embeddings',
                 pooling='mean', index_size=INDEX_SIZE, index_start=0, index_number=0,
                 fail_mode=0, fail_size=0, previous_time=0., failed_list=None):
        # make all arguments class fields
        if not failed_list:
            self.failed_list = []
        else:
            with open(failed_list, 'r') as f:
                self.failed_list = [tuple(map(int, line.split(' '))) for line in f]
        self.model = model
        self.pooling = pooling
        self.fail_size = fail_size
        self.fail_mode = fail_mode
        self.index_number = index_number
        self.index_start = index_start
        self.index_position = index_start  # index start is still needed, that's why it's not increased
        self.index_size = index_size
        self.path = path
        self.read_from_file = read_from_file
        if batch_size:
            self.batch_size = batch_size
        self.name = name
        self.data_location = data_location
        self.previous_time = previous_time
        if not self.path.endswith(f'/{name}'):  #
            self.path += f'/{name}'
        # if index_number is 0, it was started from the user, not from the script itself.
        if self.index_number == 0 and not self.fail_mode:
            try:
                os.mkdir(self.path)
            except FileExistsError:
                print(f'directory already exists and I am just deleting it.')
                shutil.rmtree(self.path)
                os.mkdir(self.path)
            # write basic information into index information File
            self.write_index_information(f'--- Index Information for Testcase {self.name} ---')
            self.write_index_information(f'Dataset: {self.data_location}')
            self.write_index_information(f'Model: {self.model}')
            self.write_index_information(f'Batch Size : {self.batch_size}')
        #  read in the dataset. The dataset is always read completely and spliced afterwards
        self.document_pairs = get_dataset(data_location, read_from_file, write=f'{name}_dataset')

        # creation of the index and the ID-Map which adds the index
        # If I change the behavior to include a fail-mode and pick up, from it, read in the current index
        if self.fail_mode:
            self.id_index = faiss.read_index(f'{path}/{name}_{index_number}')
        else:
            if not self.model == 'distiluse-base-multilingual-cased':
                self.index = faiss.IndexFlatIP(768)  # Metric InnerProduct
            else:
                self.index = faiss.IndexFlatIP(512)  # Metric InnerProduct
            self.id_index = faiss.IndexIDMap(self.index)
Example #17
 def build_index(self):
     """:returns an inverted index for the search documents"""
     vectors = [self.encode(document) for document in self.documents]
     index = faiss.IndexIDMap(
         faiss.IndexFlatIP(768))  # dimensionality of vector space
     # Add document vectors into index after transforming into numpy arrays. IDs should match len(documents)
     index.add_with_ids(np.array([vec.numpy() for vec in vectors]),
                        np.array(range(0, len(self.documents))))
     return index
Example #18
 def __init__(self, index_file_path=None, id_dict_path=None, dim=128, index_types='Flat'):
     if index_file_path and id_dict_path:
         default_logger.info('loading index from %s' % index_file_path)
         self.index = faiss.read_index(index_file_path, 0)
         self.id2key = pickle.load(open(id_dict_path, 'rb'))
     else:
         self.index = faiss.index_factory(dim, index_types)
         self.index = faiss.IndexIDMap(self.index)
         self.id2key = {}
     default_logger.info('index inited, is_trained=%s' % (self.index.is_trained))
Example #19
 def __build_index(self, index_dimension):
     if self.index_type is IndexType.L2_INDEX:
         log.debug("Building L2 index")
         index = faiss.IndexFlatL2(index_dimension)
     elif self.index_type is IndexType.COSINE_INDEX:
         log.debug("Building cosine index")
         index = faiss.IndexFlatIP(index_dimension)
     else:
         raise ValueError(f"Unknown index type {self.index_type}")
     self.__index = faiss.IndexIDMap(index)
Example #20
 def make_faiss_index_idmap(self, n_dimensions):
     """
     Make a fairly general-purpose FAISS index
     :param n_dimensions:
     :return:
     """
     print("Making index ...")
     tmp_index = faiss.IndexFlatL2(n_dimensions)
     index = faiss.IndexIDMap(tmp_index)
     return index
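A brief usage sketch for the helper above; `builder` stands for a hypothetical instance of the surrounding class, and the dimensionality, vectors, and ids are illustrative assumptions:

import numpy as np

index = builder.make_faiss_index_idmap(256)             # empty IDMap index over 256-d vectors
vectors = np.random.rand(100, 256).astype('float32')
ids = np.arange(1000, 1100, dtype='int64')               # externally chosen ids
index.add_with_ids(vectors, ids)
distances, found_ids = index.search(vectors[:3], 4)      # search returns the external ids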
Example #21
    def create_naas_faiss_index(self):
        intent_df = pd.read_pickle('data/awesome-notebooks.pkl').reset_index()
        db_ids = intent_df["intent_id"].values

        for prefix, dimension in zip(['tf', 'st'], [512, 384]):
            db_vectors = np.stack(
                intent_df[f"{prefix}_embedding"].values).astype(np.float32)
            faiss.normalize_L2(db_vectors)
            intent_index = faiss.IndexIDMap(faiss.IndexFlatIP(dimension))
            intent_index.add_with_ids(db_vectors, db_ids)
            faiss.write_index(intent_index,
                              f"data/{prefix}_naas_intent_index.idx")
Example #22
    def test_shards(self):
        k = 32
        ref_index = faiss.IndexFlatL2(d)

        print('ref search')
        ref_index.add(xb)
        _Dref, Iref = ref_index.search(xq, k)
        print(Iref[:5, :6])

        shard_index = faiss.IndexShards(d)
        shard_index_2 = faiss.IndexShards(d, True, False)

        ni = 3
        for i in range(ni):
            i0 = int(i * nb / ni)
            i1 = int((i + 1) * nb / ni)
            index = faiss.IndexFlatL2(d)
            index.add(xb[i0:i1])
            shard_index.add_shard(index)

            index_2 = faiss.IndexFlatL2(d)
            irm = faiss.IndexIDMap(index_2)
            shard_index_2.add_shard(irm)

        # test parallel add
        shard_index_2.verbose = True
        shard_index_2.add(xb)

        for test_no in range(3):
            with_threads = test_no == 1

            print('shard search test_no = %d' % test_no)
            if with_threads:
                remember_nt = faiss.omp_get_max_threads()
                faiss.omp_set_num_threads(1)
                shard_index.threaded = True
            else:
                shard_index.threaded = False

            if test_no != 2:
                _D, I = shard_index.search(xq, k)
            else:
                _D, I = shard_index_2.search(xq, k)

            print(I[:5, :6])

            if with_threads:
                faiss.omp_set_num_threads(remember_nt)

            ndiff = (I != Iref).sum()

            print('%d / %d differences' % (ndiff, nq * k))
            assert (ndiff < nq * k / 1000.)
Example #23
 def __init__(self, 
              index_name: str,
              embeddin_dim: int,
              embedding_type: str):
     
     self.index_name = index_name
     self.last_id = 0
     self.index_map = dict()
     self.encoder = encoder
     self.embeddin_dim = embeddin_dim
     self.embeddin_type = embedding_type
     self.index: faiss.IndexIDMap = faiss.IndexIDMap(faiss.IndexFlatIP(embeddin_dim))
Example #24
    def build_index(self, model, texts, embedding_dim, model_type, batch_size, from_saved=None):
        if from_saved is not None:
            with open(from_saved, 'rb') as f:
                vectors = pickle.load(f)
        else:
            vectors = calculate_embeddings(model, texts, embedding_dim, model_type=model_type, batch_size=batch_size)

        faiss.normalize_L2(vectors)
        index = faiss.IndexIDMap(faiss.IndexFlatIP(embedding_dim))
        index.add_with_ids(vectors, np.array(range(0, vectors.shape[0])))

        return index
Example #25
 def get_results(self):
     embed = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
     # Compute sentence embeddings for every text in the documents
     corpus = [d['text'] for d in self.documents]
     corpus_embeddings = np.array(embed.encode(corpus, convert_to_tensor=True))
     index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
     index.add_with_ids(corpus_embeddings, np.array(range(0, len(corpus))))
     # Write index for future usage
     faiss.write_index(index, 'pandemics')
     encoded_query = embed.encode([self.query])
     top_k = index.search(encoded_query, self.k)
     answers = [corpus[_id] for _id in top_k[1].tolist()[0]]
     return answers
Example #26
    def __init__(self, color_cursor):
        self.index = faiss.IndexIDMap(faiss.IndexFlatL2(3 * 6))

        # Query all the cover image palette
        self.id_to_arr = {
            row[0]: np.array(json.loads(row[1])).flatten()
            for row in color_cursor
        }

        # Build the index
        arr = np.stack(list(self.id_to_arr.values())).astype('float32')
        ids = np.array(list(self.id_to_arr.keys())).astype('int64')
        self.index.add_with_ids(arr, ids)
Example #27
def gen_faiss(s3,
              ssapp_docs,
              s3_bucket_name,
              paper_all,
              model,
              win_size: int = 3,
              max_words: int = 100):
    """
    gen faiss index at initialization
    Args:
        ssapp_docs: bucket
        paper_all: All papers in database
        model: sentence_bert model
        win_size: sliding window, default is 3
        max_words: max words per segment, default is 300

    Returns:
        None

    """
    # remove all the html in

    # gen the faiss indexs
    paper_titles = set([i.title for i in paper_all])
    faiss_indexs = None
    # if database has data, load the data into faiss_indexs
    if paper_all:
        paper_ids = np.array([i.id for i in paper_all])
        paper_embeddings = np.array(
            [i.e1 + i.e2 + i.e3 + i.e4 for i in paper_all]).astype("float32")
        faiss_indexs = faiss.IndexFlatIP(paper_embeddings.shape[1])
        faiss_indexs = faiss.IndexIDMap(faiss_indexs)
        faiss_indexs.add_with_ids(paper_embeddings, paper_ids)
    for file in ssapp_docs.objects.filter(Prefix='docs/'):
        file_key = file.key
        if file_key.split('.')[-1] in cf.ALLOWED_EXTENSIONS:
            # if file name is not legal, rename aws s3 file name
            # legal_name has the format: docs/...
            legal_key = file_key.encode('utf-8', 'ignore').decode('utf-8')
            if legal_key != file_key:
                s3.Object(s3_bucket_name, legal_key).copy_from(
                    CopySource=f'{s3_bucket_name}/{file_key}')
                s3.Object(s3_bucket_name, file_key).delete()
            legal_name = legal_key[5:]
            if legal_name not in paper_titles:
                # write to db
                body = BytesIO(file.get()['Body'].read())
                faiss_indexs = write_to_db(legal_name, body, model, win_size,
                                           max_words, faiss_indexs)
                write_to_html(legal_name, body, s3, s3_bucket_name)
    return faiss_indexs
Example #28
def navicode_init():
    print("\nInitializing model . . .")

    embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')

    cur_dir = os.getcwd()

    python_files = []

    for dirpath, _, files in os.walk(cur_dir):
        for filename in files:
            fname = os.path.join(dirpath, filename)
            if fname.endswith('.py'):
                python_files.append(fname)

    print(f"\nFound {len(python_files)} python sources\n")

    if len(python_files) > 0:
        dirname = os.path.basename(cur_dir)

        navi_dir = os.path.join(cur_dir, ".navi")
        if not os.path.exists(navi_dir):
            os.mkdir(navi_dir)

        corpus = []

        comments_dump = {}
        for i, python_file in enumerate(python_files):
            print(f"[{i+1}/{len(python_files)}] Scanning {python_file}")
            comments = comment_parser(python_file)
            filename = python_file[python_file.index(dirname):]
            for comment in comments:
                comments_dump[len(corpus)] = str(filename) + '---' + str(
                    comment[1]) + "---" + re.sub(r'[^a-zA-Z0-9]+', ' ',
                                                 comment[0])

                corpus.append(re.sub(r'[^a-zA-Z0-9]+', ' ', comment[0]))

        print(
            f"\nComputing comment embeddings for {len(corpus)} comments . . .")

        corpus_embeddings = embedder.encode(corpus, show_progress_bar=True)

        print("\nIndexing comment embeddings . . .")

        index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
        index.add_with_ids(corpus_embeddings, np.array(range(0, len(corpus))))
        faiss.write_index(index, os.path.join(navi_dir, dirname + '_navi'))

        with open(os.path.join(navi_dir, dirname + '_navi.json'), 'w') as file:
            json.dump(comments_dump, file, indent=4)
Example #29
    def test_int64(self):
        # see https://github.com/facebookresearch/faiss/issues/1529
        v = faiss.Int64Vector()

        for i in range(10):
            v.push_back(i)
        a = faiss.vector_to_array(v)
        assert a.dtype == 'int64'
        np.testing.assert_array_equal(a, np.arange(10, dtype='int64'))

        # check if it works in an IDMap
        idx = faiss.IndexIDMap(faiss.IndexFlatL2(32))
        idx.add_with_ids(
            np.random.rand(10, 32).astype('float32'),
            np.random.randint(1000, size=10, dtype='int64'))
        faiss.vector_to_array(idx.id_map)
Example #30
    def test_stress(self):
        # a mixture of the above, from issue #631
        target = np.random.rand(50, 16).astype('float32')

        index = faiss.IndexReplicas()
        size, dim = target.shape
        num_gpu = 4
        for _i in range(num_gpu):
            config = faiss.GpuIndexFlatConfig()
            config.device = 0   # simulate on a single GPU
            sub_index = faiss.GpuIndexFlatIP(faiss.StandardGpuResources(), dim, config)
            index.addIndex(sub_index)

        index = faiss.IndexIDMap(index)
        ids = np.arange(size)
        index.add_with_ids(target, ids)