def cloud_main(self, file_count):
    # conn, cursor, and sql_select_Query are assumed to be set up earlier in the class
    # (see the fuller version of this method in Example #3 below).
    with open('encrypted_vectors/feature_vectors.json') as data_file:
        feature_loaded = json.load(data_file)
    with open('encrypted_indices/indices.json') as data_file:
        indices_loaded = json.load(data_file)
    with open('encrypted_query/query_vectors.json') as data_file:
        query_loaded = json.load(data_file)
    d = Decrypt()
    feature_vectors = d.decrypt_indices_vector(bytes(feature_loaded))
    indices = d.decrypt_indices_vector(bytes(indices_loaded))
    query_vectors = d.decrypt_indices_vector(bytes(query_loaded))
    feature_vectors = np.frombuffer(feature_vectors, dtype=int)
    feature_vectors = np.reshape(feature_vectors, (file_count, -1))
    indices = json.loads(indices.decode())
    query_vectors = np.frombuffer(query_vectors, dtype=int)
    query_vectors = np.reshape(query_vectors, (-1, 6))
    print(feature_vectors.shape, indices, query_vectors.shape)
    lsh = LSH(feature_vectors, indices)
    n_neighbors, result = lsh.query(query_vectors, 6, 45)
    print(n_neighbors)
    cursor.execute(sql_select_Query)
    records = cursor.fetchall()
    for row in records:
        if row[0] - 1 in result:
            image_name = row[1]
            print(image_name)
            CloudAPISender().cloud_api_sender(image_name)
    # Closing the connection
    conn.close()
Example #2
File: nn.py  Project: mikaelbaymani/nn
def query(fname, key='key', topk=10, truncate=80):

    model = pickle.load(open(CONST.MODEL, 'rb'))

    dataframe = pd.read_csv(CONST.DATASET)
    corpus = TfidfVectorizer().fit_transform(dataframe['content'])

    lsh = LSH(corpus, model)
    index = dataframe[dataframe[key].apply(str) == str(fname)].index[0]

    dataframe['content'] = dataframe['content'].str[:int(truncate)]
    neighbors = lsh.query(corpus[index, :], int(topk), 10)[0]
    return neighbors.join(dataframe, on='id').sort_values('distance').iloc[:, 1:]
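A hedged call sketch for the function above; the lookup value 'doc_42' is hypothetical, and CONST.MODEL / CONST.DATASET must already exist on disk:

# Return the 5 nearest documents to the row whose 'key' column equals 'doc_42',
# with the 'content' column truncated to 120 characters for display.
neighbors = query('doc_42', key='key', topk=5, truncate=120)
print(neighbors.head())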
Example #3
def cloud_main(self, file_count):
    # Establishing the connection
    conn = mysql.connector.connect(user='******',
                                   password='******',
                                   host='127.0.0.1',
                                   database='ImageRetrieval')
    # Creating a cursor object using the cursor() method
    cursor = conn.cursor()
    # Preparing SQL query to select a record from the database.
    sql_select_Query = "select * from images"
    with open('encrypted_vectors/feature_vectors.json') as data_file:
        feature_loaded = json.load(data_file)
    with open('encrypted_indices/indices.json') as data_file:
        indices_loaded = json.load(data_file)
    with open('encrypted_query/query_vectors.json') as data_file:
        query_loaded = json.load(data_file)
    d = Decrypt()
    feature_vectors = d.decrypt_indices_vector(bytes(feature_loaded))
    indices = d.decrypt_indices_vector(bytes(indices_loaded))
    query_vectors = d.decrypt_indices_vector(bytes(query_loaded))
    feature_vectors = np.frombuffer(feature_vectors, dtype=int)
    feature_vectors = np.reshape(feature_vectors, (file_count, -1))
    indices = json.loads(indices.decode())
    query_vectors = np.frombuffer(query_vectors, dtype=int)
    query_vectors = np.reshape(query_vectors, (-1, 6))
    print(feature_vectors.shape, indices, query_vectors.shape)
    lsh = LSH(feature_vectors, indices)
    n_neighbors, result = lsh.query(query_vectors, 6, 45)
    print(n_neighbors)
    cursor.execute(sql_select_Query)
    records = cursor.fetchall()
    for row in records:
        if row[0] - 1 in result:
            image_name = row[1]
            print(image_name)
            CloudAPISender().cloud_api_sender(image_name)
    # Closing the connection
    conn.close()
Example #4
vect_length = tfidf.vect_length  # length of the input vector
num_hashtables = 1  # number of iterations
digest_length = 0
print('perform lsh')
lsh = LSH(digest_length, vect_length, num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])
''' Query documents '''
dedup = set()
keys = lsh.hash_tables[0].keys()
i = 0
for key in keys:
    bucket = lsh.hash_tables[0].get_val(key)
    for query_object in bucket:
        candidates = lsh.query(query_object[0], distance_func='cosine')
        for c in candidates:
            candidate_key = c[0][1]  # warc id is appended as extra data in lsh.index()
            if candidate_key == query_object[1]:
                continue
            if str(query_object[1]) <= str(candidate_key):
                candidate_distance = c[1]
                if float(candidate_distance) >= threshold:
                    dedup.add((query_object[1], candidate_key,
                               candidate_distance, 'false'))
                else:
                    dedup.add((query_object[1], candidate_key,
                               candidate_distance, 'true'))
            i += 1
            sys.stdout.write("\rProcessed %i candidates" % i)
            sys.stdout.flush()
Example #5
class DND:
    MAX_SIZE = 25
    TM = 0.1

    def __init__(self, N, D, K, L):
        self.lsh = LSH(SimHash(D, K, L), K, L)
        self.keys = np.zeros((N, D), dtype=np.float32)
        self.values = np.zeros((N, 1), dtype=np.float32)
        self.lru = np.zeros(N, dtype=np.float32)
        self.key2idx = dict()

        self.size = 0
        self.max_memory = N
        self.K = K
        self.L = L

    def __contains__(self, key):
        return tuple(key) in self.key2idx

    def __getitem__(self, key):
        try:
            index = self.key2idx[tuple(key)]
            self.lru[index] += DND.TM
            return self.values[index]
        except KeyError:
            return None

    def __setitem__(self, key, value):
        item = tuple(key)
        try:
            # 1) Find memory index for key vector
            index = self.key2idx[item]
        except KeyError:
            # 2) Add key vector if not present
            if self.size >= self.max_memory:
                # 3) If memory is full, select LRU memory index and remove from LSH hash tables
                index = np.argmin(self.lru)
                self.lsh.erase(self.keys[index], index)
            else:
                index = self.size
                self.size += 1

            # Rehash key into LSH hash tables
            self.lsh.insert(key, index)
            self.key2idx[item] = index

            # Add new key to memory
            self.keys[index] = key
        finally:
            # Update memory value
            self.values[index] = value
            self.lru[index] += DND.TM

    def retrieve(self, query):
        # Collect memory indices from LSH hash tables
        indices, cL = self.lsh.query(query.data, DND.MAX_SIZE)

        # Gather keys and values from memory
        keys = self.keys[indices]
        values = self.values[indices]
        self.lru[indices] += DND.TM

        assert (keys.shape[0] == values.shape[0])
        return keys, values, indices, cL
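A minimal usage sketch for the DND class above, assuming the project's SimHash and LSH classes are importable; the sizes, the numpy key, and the torch query tensor are illustrative assumptions, not part of the original example:

import numpy as np
import torch

# Hypothetical sizes: 100 memory slots, 16-dim keys, K=8 hash bits, L=2 tables.
dnd = DND(N=100, D=16, K=8, L=2)

key = np.random.randn(16).astype(np.float32)
dnd[key] = 0.5          # hashes the key into the LSH tables and stores the value
print(key in dnd)       # True
print(dnd[key])         # [0.5]

# retrieve() passes query.data to lsh.query(), so a torch tensor is assumed here.
query = torch.randn(16)
keys, values, indices, cL = dnd.retrieve(query)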
Example #6
lsh = LSH(digest_length, vect_length,
          num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])
''' Query documents '''
log += str(time.asctime(time.localtime(time.time()))) + '\n'
log += 'query documents\n'
print(time.asctime(time.localtime(time.time())))
print('Query documents')
distance_func = "cosine"

corr = set()

for i, key in enumerate(tfidf._id_list):
    query_object = tfidf.get_vector(i)
    candidates = lsh.query(query_object, distance_func=distance_func)
    for c in candidates:
        candidate_key = c[0][1]  # warc id is appended as extra data in lsh.index()
        candidate_distance = c[1]
        if str(key) < str(candidate_key):
            candidate = (key, candidate_key, candidate_distance)
            corr.add(candidate)

corr_list = list(corr)
corr_list = sorted(corr_list, key=lambda x: x[2])

with open('correspondences/' + domain + '_sim.txt', 'w') as f:
    for t in corr_list:
        f.write(t[0] + ' ' + t[1] + ' ' + str(t[2]) + '\n')
Example #7
File: test_lsh.py  Project: pdoyle5000/lsh
class TestLsh(TestCase):
    """TODO: Test Case docstring goes here."""
    def setUp(self):
        self.lsh = LSH(3, 2, 1)
        self.lsh_two_tables = LSH(3, 2, 2)

        # Overwrite randomly initialized planes with known values.
        self.lsh.planes = [np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]])]
        self.lsh_two_tables.planes = [
            np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]]),
            np.array([[-0.1, -0.2], [0.1, 0.2], [-2.0, 2.0]]),
        ]

    def test_hashing(self):
        vector_ones = [1, 1]
        # Each plane is dotted with the vector (no bias term); a positive
        # result appends "1" to the hash string, "0" otherwise.
        self.assertEqual(self.lsh.hash(self.lsh.planes[0], vector_ones), "100")

        vector_twos = [-2, 2]
        self.assertEqual(self.lsh.hash(self.lsh.planes[0], vector_twos), "101")

    def test_table_indexing(self):
        self.lsh.index([1, 1], "data1")
        self.lsh.index([-2, 2], "data2")
        self.assertDictEqual(self.lsh.hash_tables[0], {
            "100": [([1, 1], "data1")],
            "101": [([-2, 2], "data2")]
        })

        self.lsh_two_tables.index([1, 1], "data1")
        self.lsh_two_tables.index([-2, 2], "data2")
        self.assertDictEqual(
            self.lsh_two_tables.hash_tables[0],
            {
                "100": [([1, 1], "data1")],
                "101": [([-2, 2], "data2")]
            },
        )
        self.assertDictEqual(
            self.lsh_two_tables.hash_tables[1],
            {
                "010": [([1, 1], "data1")],
                "011": [([-2, 2], "data2")]
            },
        )

    def test_query(self):
        self.lsh.index([1, 1], "data1")
        self.lsh.index([-2, 2], "data2")
        output = self.lsh.query([1, 1], 1)
        self.assertEqual(output, ["data1"])

        self.lsh_two_tables.index([1, 1], "data1")
        self.lsh_two_tables.index([-2, 2], "data2")
        output = self.lsh_two_tables.query([1, 1], 1)
        self.assertEqual(output, ["data1"])

        self.lsh_two_tables.index([-1, -1], "data3")
        self.lsh_two_tables.index([6, 6], "data4")
        self.lsh_two_tables.index([-10, -10], "data5")
        output = self.lsh_two_tables.query([6, 6], 2)
        self.assertEqual(output, ["data4", "data1"])
Example #8

if __name__ == "__main__":
    data, q = load_data()

    # Brute Force
    nn_brute, nn_brute_dist = brute_force_nn(q, data)

    # Ball Tree
    bt = BallTree(data, 10)
    nn_bt = bt.query_top_down(q)

    # LSH
    hash_fn_gen = lambda: guassian_hash_generator(150, data.shape[1])
    lsh = LSH(data, hash_fn_gen, 1, 10)
    nn_lsh, performance_limit = lsh.query(q)

    # Ball tree LSH
    print('balltree lsh performance limit', performance_limit)
    lsh = LSH(data, hash_fn_gen, 1, 3)
    bt_lsh = BallTreeLSH(bt, lsh)
    nn_bt_lsh = bt_lsh.query(q, performance_limit=performance_limit)

    # compare_results
    _, nn_lsh_to_q = brute_force_nn(nn_lsh, q)
    _, nn_bt_to_q = brute_force_nn(nn_bt, q)
    _, nn_bt_lsh_to_q = brute_force_nn(nn_bt_lsh, q)

    #########
    # Stats #
    #########
Example #9
def train(device, data, schedule, mi_type, args):
    model = MI_Estimator(device, D=d, ED=ed, HD=256)
    model.to(device)
    model.train()

    optimizer = optim.Adam(model.parameters(), lr=5e-4)

    xs, ys = data
    xs = xs.to(device)
    ys = ys.to(device)

    lsh = LSH(SimHash(ed, K, L), K, L)

    estimates = []
    avg_estimate = []

    id_set = set()
    n_iters = num_iterations * batch_size
    for batch_idx in range(n_iters):
        iteration = batch_idx // batch_size
        MI = schedule[iteration]

        t = 10 if batch_idx <= 1000 else 100
        if batch_idx % t == 0:
            build(lsh, model, xs)

        optimizer.zero_grad()

        y = ys[batch_idx:batch_idx + 1]
        ey = model.embed_y(y)

        id_list = lsh.query(ey)
        id_set = id_set.union(set(id_list))
        indices = torch.LongTensor(id_list).to(device)

        nx = F.embedding(indices, xs)
        px = xs[batch_idx:batch_idx + 1]
        x = torch.cat([px, nx], dim=0)
        x = torch.unsqueeze(x, dim=0)

        mi = model(x, y, args)
        loss = -mi
        loss.backward()
        optimizer.step()

        avg_estimate.append(mi.item())
        if (batch_idx + 1) % 100 == 0:
            '''
            asim = model.cosine_similarity(x, y)
            true = torch.mean(torch.diag(asim))
            neye = 1. - torch.eye(batch_size).to(device)
            noise = torch.sum(torch.mul(asim, neye)).item() / (batch_size * (batch_size-1))
            print("MI:{} true: {:.4f}, noise: {:.4f}".format(MI, true, noise))
            '''
            avg_mi = sum(avg_estimate) / float(len(avg_estimate))
            print('{} {} MI:{}, E_MI: {:.6f}'.format(mi_type.name,
                                                     batch_idx + 1, MI,
                                                     avg_mi))
            sys.stdout.flush()

        if (batch_idx + 1) % wsize == 0:
            print(len(id_set), len(id_set) // wsize)
            id_set.clear()
            avg_mi = sum(avg_estimate) / float(len(avg_estimate))
            estimates.append(avg_mi)
            avg_estimate.clear()
    lsh.stats()
    return estimates