Exemplo n.º 1
0
    def swap_index(self, location):
        """Replace the active index with a FAISS index rooted at *location*,
        train it on the buffered training data, and replay every stored
        document into it."""
        logging.debug("Swapping index to FAISS")

        # create and train the FAISS-backed index
        self.index = hfaiss.Faiss(location + "/h_faiss")
        self.index.init_faiss(self.training_data)

        # migrate: key -1 in the KV store holds the next free integer id,
        # so every id below it is a (possibly deleted) document slot
        total = int(self.KV_store.get(byt(-1)))
        for key_ in range(total):
            record = self.KV_store.get(byt(key_))
            if not record:
                continue

            # record layout: 2-byte cid length prefix, then cid, then bson doc
            prefix_len = int(record[:2]) + 2
            doc = CID.bson2doc(record[prefix_len:])
            self.index.add_vectors([{"_id": int(key_), "code": doc["code"]}])

        # mark the FAISS index as active
        self.active_index = INDEX_LABEL[1]
Exemplo n.º 2
0
    def test_2_db_exist_create(self):
        """Creating a DB and independently deriving the name from an
        identical schema must yield the same database name."""
        base_schema = {
            "description": "this is my database",
            "unique": "r8and0mseEd90",
            "encoder": "example.com/autoencoder/API",
            "codelen": 3,
            "metadata": {
                "name": "string",
                "age": "number"
            }
        }
        # structurally identical second copy, as two literals in the original
        twin_schema = dict(base_schema,
                           metadata=dict(base_schema["metadata"]))

        created_name = router.create_database(base_schema)

        # derive the expected name straight from the generated schema's CID
        expected_name = CID.doc2CID(schema.generate_schema(twin_schema))

        self.assertEqual(created_name, expected_name,
                         "DB name doesn't match")
Exemplo n.º 3
0
    def get_nearest(self, qmatrix, k, rad):
        """Query the active index for the nearest stored documents.

        Args:
            qmatrix: query matrix; resized to FIXED_VEC_DIMENSION columns.
            k: maximum neighbours per query row, or None for an
               unrestricted radius search.
            rad: search radius, or None for a pure k-NN search.

        Returns:
            (ids, dists) where each entry of ids is replaced by the decoded
            document from the KV store (or None when its record is missing).
        """
        qmatrix = self.resize_matrix(qmatrix,
                                     int(os.environ["FIXED_VEC_DIMENSION"]))

        if rad is not None:
            ids, dists = self.index.get_nearest_rad(qmatrix, rad)
            # BUGFIX: the original applied [:k] in the k-is-None branch,
            # where it sliced the (ids, dists) tuple by None (a no-op), and
            # ignored k entirely when it was supplied.  Truncate each result
            # row to k only when k is given.
            if k is not None:
                ids = [row[:k] for row in ids]
                dists = [row[:k] for row in dists]
        else:
            ids, dists = self.index.get_nearest_k(qmatrix, k)

        # replace raw integer ids with the stored documents
        for idx_, idb in enumerate(ids):
            for idx__, id_ in enumerate(idb):
                value = self.KV_store.get(byt(id_))
                if value:
                    # 2-byte cid-length prefix, then cid, then bson document
                    cid_len_ = int(value[:2]) + 2
                    ids[idx_][idx__] = CID.bson2doc(value[cid_len_:])
                else:
                    ids[idx_][idx__] = None

        return ids, dists
Exemplo n.º 4
0
    def test_1_auth_create_db(self):
        """End-to-end: start the app, create a database over HTTP with a
        signed payload, and check the returned name equals the schema CID."""
        # deploy app
        index.server.start()

        schema_def = {
            "description": "this is my database",
            "unique": "r8and0mseEd905",
            "encoder": "example.com/autoencoder/API",
            "codelen": 30,
            "metadata": {
                "name": "string",
                "age": "number"
            }
        }
        data_ = {"schema": schema_def}
        data_bson = bson.dumps(data_)

        # generate hash (renamed from `hash` to avoid shadowing the builtin)
        digest = SHA384.new()
        digest.update(data_bson)

        # Sign with pvt key
        signer = pkcs1_15.new(priv_key)
        signature = signer.sign(digest)
        signature = base58.b58encode(signature).decode("utf-8")

        url = "http://127.0.0.1:5001/db/create"

        headers = CaseInsensitiveDict()
        headers["Content-Type"] = "application/json"

        # separate names for the dict and its JSON encoding (the original
        # rebound `data` from dict to str)
        payload = json.dumps({"data": data_, "signature": signature})

        resp = requests.post(url, headers=headers, data=payload)

        database_name_ = resp.json()["database_name"]

        # independently derive the expected name from the generated schema
        schema_def = schema.generate_schema(schema_def)
        database_name = CID.doc2CID(schema_def)

        # stop the app before asserting so the server is always shut down
        index.server.terminate()
        index.server.join()

        self.assertEqual(database_name, database_name_,
                         "DB name doesn't match")
Exemplo n.º 5
0
    def __init__(self, json_schema):
        """Initialise a manager for the database described by *json_schema*:
        create its on-disk layout, the vector index, the KV store, the
        training-data buffer, and spawn the background worker."""
        # get database name from schema CID
        database_name = CID.doc2CID(json_schema)

        # keep database name
        self.database_name = database_name

        # set DB disk location
        self.DB_disk_location = STORE_LOCATION + database_name

        # create data directory for database
        # (exist_ok avoids the check-then-create race of the original)
        os.makedirs(self.DB_disk_location, exist_ok=True)

        # keep schema in store location
        with open(self.DB_disk_location + '/schema.json', 'w') as oschema:
            json.dump(json_schema, oschema)

        # start on the first index type
        self.active_index = INDEX_LABEL[0]
        self.index = self.get_index(self.DB_disk_location)

        # Create KV store instance; key -1 tracks the next free integer id
        self.KV_store = plyvel.DB(self.DB_disk_location + "/kv.db",
                                  create_if_missing=True)
        # idiom fix: `is None` instead of `== None`
        if self.KV_store.get(byt(-1)) is None:
            self.KV_store.put(byt(-1), byt(0))

        # Training data holder
        self.training_data = []
        self.TD_location = self.DB_disk_location + "/TD"
        # Try loading training data
        self.load_TD_from_disk()

        # spawn worker thread
        self.q_maxsize = MAX_Q_LEN
        self.process_flag = True
        self.process_timeout_sec = PROCESS_TIMEOUT
        self.spawn()
Exemplo n.º 6
0
def create_database(json_schema):
    """
    Create a database from a given valid JSON schema.

    Returns the database name (the CID of the generated schema definition),
    or None when the schema template is invalid.  Creating a database that
    already exists is idempotent and simply returns its name.
    """
    # TBD: write ahead logging (INIT)

    # generate proper schema definition from template schema
    json_schema = schema.generate_schema(json_schema)

    # identify invalid schema template (idiom fix: `is None`, not `== None`)
    if json_schema is None:
        return None

    # Check if database already exists
    database_name = CID.doc2CID(json_schema)
    if databases.get(database_name):
        # return database name
        logging.debug("Database already exists")
        return database_name

    # If database doesn't exist already,
    # then create one
    manager_h = manager.VecManager(json_schema)

    database_name = manager_h.database_name

    # compile the schema once so later inserts can validate cheaply
    validator_fn = schema.compile(json_schema)
    databases[database_name] = {
        "manager_h": manager_h,
        "schema": {
            "json": json_schema,
            "validator": validator_fn
        }
    }

    # TBD: save schema to storage
    # TBD: write ahead logging (END)

    return database_name
Exemplo n.º 7
0
def insert_docs(docs, database_name):
    """
    Insert a set of valid documents to database
    """

    # write ahead log (INIT)

    cids_ = []
    accepted = []

    # look up the handle for database_name
    database_h = load_database(database_name)
    if not database_h:
        # invalid database name
        logging.debug("Database doesn't exist. Please create one.")
        return cids_

    validator = database_h["schema"]["validator"]

    # validate each document against the schema; an invalid document
    # contributes a None in the returned CID list
    for doc_ in docs:
        if not schema.validate_json_docs(validator, doc_):
            cids_.append(None)
            continue
        cid_value = CID.doc2CID(doc_)
        cids_.append(cid_value)
        doc_["CID"] = cid_value
        accepted.append(doc_)

    # hand the accepted documents to the database's vector manager
    database_h["manager_h"].add_vectors(accepted)

    # write ahead log (END)

    return cids_
Exemplo n.º 8
0
    def add_vectors(self, documents):
        """Persist *documents* to the KV store and add their vectors to the
        active index.

        Each document must carry a "CID" and a "code" (vector) field; the
        vector is resized to FIXED_VEC_DIMENSION before storage.  Assigns a
        sequential integer "_id" to every document (mutating the caller's
        list in place) and returns the underlying index's add_vectors result.
        """
        # add to KV store; key -1 holds the next free integer id
        next_index = int(self.KV_store.get(byt(-1)))

        # check if it is ready to swap index: enough documents stored,
        # still on the initial index, and a full training buffer
        if next_index > TRAIN_DAT_LEN and self.active_index == INDEX_LABEL[0] \
            and len(self.training_data) >= TRAIN_DAT_LEN:
            # swap index
            self.swap_index(self.DB_disk_location)

        # init batch write to DB
        wb_ = self.KV_store.write_batch()
        for idx_, doc_ in enumerate(documents):
            cid_ = byt(doc_["CID"])
            # resize "code" to the dimension the index expects
            doc_["code"] = self.resize_vector(
                doc_["code"], int(os.environ["FIXED_VEC_DIMENSION"]))
            # cod_ = pickle.dumps(cod_)
            doc_["_id"] = next_index  # visible to the caller (in-place)

            # TBD: convert to bulk insert
            # record layout: <cid length><cid><bson(doc)>; readers decode the
            # length with int(value[:2]), so byt() presumably yields a
            # 2-byte length — TODO confirm against byt's definition
            wb_.put(byt(next_index),
                    byt(len(cid_)) + cid_ + CID.doc2bson(doc_))
            # reverse mapping: CID -> integer id
            wb_.put(cid_, byt(next_index))

            next_index += 1

        # persist the advanced next-free-id counter
        wb_.put(byt(-1), byt(next_index))
        # commit DB write
        wb_.write()

        # push to training data
        self.update_training_data(documents)

        # add vectors to index
        return self.index.add_vectors(documents)