def test_noninteger_per_band(self):
    """Check that LSH rejects a non-integer ``per_band`` argument.

    The test passes when ``lsh.LSH(per_band="foo")`` raises any ordinary
    exception and fails when construction succeeds.
    """
    try:
        lsh.LSH(per_band="foo")
    except Exception:
        # Any ordinary error counts as the constructor rejecting the value.
        # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
        pass
    else:
        self.fail("tried to set per_band to a noninteger value.")
def test_too_few_bands(self):
    """Check that LSH rejects a negative ``bands`` argument.

    The test passes when ``lsh.LSH(bands=-10)`` raises any ordinary
    exception and fails when construction succeeds.
    """
    try:
        lsh.LSH(bands=-10)
    except Exception:
        # Any ordinary error counts as the constructor rejecting the value.
        # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
        pass
    else:
        self.fail("bands was set to -10, should have failed.")
def test_too_few_per_band(self):
    """Check that LSH rejects a negative ``per_band`` argument.

    The test passes when ``lsh.LSH(per_band=-10)`` raises any ordinary
    exception and fails when construction succeeds.
    """
    try:
        lsh.LSH(per_band=-10)
    except Exception:
        # Any ordinary error counts as the constructor rejecting the value.
        # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
        pass
    else:
        self.fail("per_band was set to -10, should have failed.")
def test_cache_is_usable(self):
    """Check that a second LSH instance can load previously cached data.

    Calls ``dump_data`` and ``create_trained_model`` for the
    ``"small_correct"`` assignment first, then constructs a separate LSH
    for the same assignment and requires ``load_cached_data()`` to finish
    without raising.
    """
    self.dump_data("small_correct")
    # Called for its side effects only; the returned model is not needed.
    self.create_trained_model("small_correct", 21)

    l2 = lsh.LSH(assignment_name="small_correct")
    try:
        l2.load_cached_data()
    except Exception as exc:
        # The previous ``assertFalse(False)`` here always passed, so a
        # broken cache could never make this test fail.
        self.fail("load_cached_data() raised %r" % (exc,))
def __init__(self, L=1, k=1):
    """Set the fixed analysis parameters and create the backing LSH store.

    Args:
        L: first positional argument forwarded to ``lsh.LSH`` (default 1).
        k: second positional argument forwarded to ``lsh.LSH`` (default 1).
    """
    # NOTE(review): fs looks like a sample rate in Hz and pmin/pmax like
    # pitch bounds -- confirm against the rest of the class.
    self.fs = 44100.0
    self.pmax, self.pmin = 96, 36
    # Derived sizes: cqtsz values per segment slot, segsz slots, M in total.
    self.cqtsz = self.pmax - self.pmin
    self.segsz = 20
    self.M = self.segsz * self.cqtsz
    self.fftsz = 1 << 15
    # A tenth of ``fs``, truncated to an integer.
    self.hop = int(self.fs * 0.1)
    # The third LSH argument is the derived size M.
    self.db = lsh.LSH(L, k, self.M)
# print(test3)
# print(os.getcwd())
import sys

# Extend the import path with ``test3`` so the ``query`` module can be found.
sys.path.append(test3)
import query

# Vectorise a sample sentence; the result is used as the query vector below.
test_result = query.get_query_vec(
    "This is a long string. It used to test the fast query and vectorisation parts. and chek"
    "if it works well"
)
print(test_result)
print(type(test_result))

# Input locations.
file = "../data_vectors/matrix_model.npy"
parsed_files_dir = "../data_vectors/parsed_data"

# create instance of LSH class
lsh_test = lsh.LSH()

# table parameters
dataset = np.load(file)
number_of_tables = 50
euclidean = True

# create table
LSHtable = lsh_test.LSHtable(dataset, euclidean, number_of_tables)

# query parameters
# query = dataset[0]  # this can be get from Daniel
# NOTE: from here on ``query`` names the vector, not the imported module.
query = test_result
k = 10  # try k=100
# print("test===============================", dataset[0])

# run query and return query run time
def image2sim(input_images, prefix='image'):
    """Build, or load from disk, a sparse image-to-image similarity matrix.

    Settings are read from the module-level ``exp_config``: the data
    directory, the similarity ``method`` and the "identical" threshold.
    Intermediate results (image list, embeddings, similarity matrix) are
    cached as files named ``PATH + prefix + ...``; an existing file is
    loaded in preference to recomputing.

    Args:
        input_images: sequence of image file names relative to the data
            path; ``None`` entries are skipped. Only consulted when no
            cached image list exists.
        prefix: file-name prefix for all cache files.

    Returns:
        A tuple ``(image2eid, sim)``. ``image2eid`` maps each image name to
        its row/column index in ``sim``, and maps ``None`` to
        ``len(images)``. ``sim`` is the sparse similarity matrix.

    Raises:
        AssertionError: if the configured method is not supported.
    """
    PATH = exp_config.get('data', 'path')
    # NOTE(review): eval() executes whatever text the config holds. Only safe
    # while the config file is trusted; consider ast.literal_eval.
    IDENTICAL_T = eval(exp_config.get('predicate_image', 'identical_threshold'))
    method = exp_config.get('predicate_image', 'method')
    # Read (and validated by eval) but not used further in this function.
    embedding_iters = eval(exp_config.get('cosine_embedding', 'n_iter'))
    assert method in [
        'identical', 'vgg16', 'vgg19', 'xception', 'inception_resnet_v2',
        'vggface'
    ]

    list_path = PATH + prefix + '_list' + '.txt'
    sim_path = PATH + prefix + '_sim_' + method + '.npz'
    embeddings_path = PATH + prefix + '_embeddings_' + method + '.npy'

    print('input_images', len(input_images))

    # --- image list: one name per line, cached so indices stay stable ------
    if os.path.isfile(list_path):
        with open(list_path, 'r') as fin:
            # rstrip keeps the last character of a final line that has no
            # trailing newline (``line[:-1]`` would cut it off).
            images = [line.rstrip('\n') for line in fin]
    else:
        images = [image for image in input_images if image is not None]
        with open(list_path, 'w') as fout:
            for image in images:
                fout.write(image)
                fout.write('\n')

    # --- perceptual-hash based similarity ----------------------------------
    if method == 'identical':
        if os.path.isfile(sim_path):
            sim = scipy.sparse.load_npz(sim_path)
        else:
            funs = [
                imagehash.average_hash, imagehash.phash, imagehash.dhash,
                imagehash.whash
            ]
            im_objs = [Image.open(PATH + image) for image in images]
            print('images', len(images), 'im_objs', len(im_objs))

            # One vector of hashes (one entry per hash function) per image.
            vs = [np.array([fun(obj) for fun in funs]) for obj in im_objs]

            sim = dok_matrix((len(images), len(images)), dtype=np.float32)
            for i, v_i in enumerate(vs):
                current_t = time.time()
                for j, v_j in enumerate(vs):
                    # Median of the element-wise hash differences.
                    s = np.median(v_i - v_j)
                    if s < IDENTICAL_T:
                        sim[i, j] = (IDENTICAL_T - s) / IDENTICAL_T
                print('processing images ', i, 100 * i // len(images),
                      time.time() - current_t, 's')
            sim = sim.asformat('csr')
            scipy.sparse.save_npz(sim_path, sim)

    # --- CNN-embedding based similarity ------------------------------------
    if method in [
            'vgg16', 'vgg19', 'xception', 'inception_resnet_v2', 'vggface'
    ]:
        # Imports are local so only the selected backbone is loaded.
        if method == 'vgg16':
            from keras.applications.vgg16 import VGG16
            from keras.preprocessing import image as keras_image
            from keras.applications.vgg16 import preprocess_input
            model = VGG16(weights='imagenet', include_top=False)
        if method == 'vgg19':
            from keras.applications.vgg19 import VGG19
            from keras.preprocessing import image as keras_image
            from keras.applications.vgg19 import preprocess_input
            model = VGG19(weights='imagenet', include_top=False)
        if method == 'xception':
            from keras.applications.xception import Xception
            from keras.preprocessing import image as keras_image
            from keras.applications.xception import preprocess_input
            model = Xception(weights='imagenet', include_top=False)
        if method == 'inception_resnet_v2':
            from keras.applications.inception_resnet_v2 import InceptionResNetV2
            from keras.preprocessing import image as keras_image
            from keras.applications.inception_resnet_v2 import preprocess_input
            model = InceptionResNetV2(weights='imagenet', include_top=False)
        if method == 'vggface':
            print('vggface')
            from keras_vggface.vggface import VGGFace
            from keras.preprocessing import image as keras_image
            from keras_vggface.utils import preprocess_input
            model = VGGFace(include_top=False)

        def get_feature(img_path):
            """Return the model output for the image stored at ``img_path``."""
            img = keras_image.load_img(img_path, target_size=(224, 224))
            x = keras_image.img_to_array(img)
            x = np.expand_dims(x, axis=0)
            x = preprocess_input(x)
            feature = model.predict(x)
            return feature

        if os.path.isfile(embeddings_path):
            embeddings = np.load(embeddings_path)
        else:
            print('get image features')  # debug
            embeddings = list()
            for image in images:
                embeddings.append(get_feature(PATH + image).flatten())
                print('process', image)
            embeddings = np.array(embeddings, dtype=np.float32)
            np.save(embeddings_path, embeddings)

        if os.path.isfile(sim_path):
            sim = scipy.sparse.load_npz(sim_path)
        else:
            lsh_instance = lsh.LSH(8, 5)
            indices = lsh_instance.load(embeddings)
            sim = dok_matrix((len(images), len(images)), dtype=np.float32)
            for i in range(len(images)):
                v_i = embeddings[i]
                # Score only the candidates returned by the LSH query.
                for j in lsh_instance.query(indices[i]):
                    v_j = embeddings[j]
                    sim[i, j] = similarity(v_i, v_j)
                sys.stdout.write("\r%d%%" % (100 * i // len(images)))
                sys.stdout.flush()
            sim = sim.asformat('csr')
            scipy.sparse.save_npz(sim_path, sim)

    image2eid = dict(zip(images, range(len(images))))
    # A missing image (None) gets the index one past the last real image.
    image2eid[None] = len(images)
    return image2eid, sim
def test_data_is_not_there(self):
    """Check that ``is_cached()`` is false for the "data_not_there" assignment."""
    model = lsh.LSH(assignment_name="data_not_there")
    cached = model.is_cached()
    message = "LSH tells us there's data here, but there shouldn't be."
    self.assertFalse(cached, message)
def create_trained_model(self, assignment_name, dims):
    """Create an LSH for ``assignment_name`` and bin its test data.

    Args:
        assignment_name: forwarded to ``lsh.LSH`` and used to fetch the
            data via ``retrieve_test_data``.
        dims: forwarded to ``bin_data`` as its ``dims`` keyword. This was
            previously ignored in favour of a hard-coded 21.

    Returns:
        The LSH instance after ``bin_data`` has been called on it.
    """
    l = lsh.LSH(assignment_name=assignment_name)
    data = self.retrieve_test_data(assignment_name)
    l.bin_data(data, dims=dims)
    return l
def test_vanilla_instantiation(self):
    """Check that ``lsh.LSH()`` can be constructed with no arguments."""
    try:
        lsh.LSH()
    except Exception as exc:
        # Report what went wrong; a bare ``self.fail()`` hides the cause.
        self.fail("LSH() raised %r" % (exc,))
def test_cache_is_not_usable(self):
    """Check that loading the "broken_cache" assignment raises ``KeyError``."""
    # Need to find a better Exception to raise.
    broken = lsh.LSH(assignment_name="broken_cache")
    with self.assertRaises(KeyError):
        broken.load_cached_data()
def setUp(self):
    """Create the LSH instance under test with ``n_features=10``."""
    index_under_test = lsh.LSH(n_features=10)
    self.lsh = index_under_test
'My lifestyle is funky, and I want to be heard. Here is to thirsty encounters, or something. #powerpile #randomtweet', 'My diet is a personal wish, and I want to fly away. For all detailed glamour, forever. #thiswad #randomtweet', 'My favorite team is paid for, and I want to slow down. More sick lessons, so say we all. #megababe #randomtweet' ] # create a lot of duplicates tweets_with_duplicates = [] [tweets_with_duplicates.extend(tweets) for _ in range(500)] ## get vectors from tweets using sklearn vecs = TfidfVectorizer(stop_words='english').fit_transform(tweets_with_duplicates).toarray() ## create lsh num_tables = 20 hash_size = 13 bucket_size = 100000 input_dimension = vecs.shape[1] sann = lsh.LSH(num_tables, hash_size, input_dimension, bucket_size) ## simulate streaming nearest neighbor search for id, vec in enumerate(vecs): sann[vec] = id # append tweet to the index ids = sann[vec] # get the ids of the nearest vectors # get next tweet next_id = ids[0] next_tweet = tweets_with_duplicates[next_id] # pretty print current tweet and its nearest neighbor print(f'\n\033[1mNearest neighbor of\033[0m\n{tweets_with_duplicates[id]}\n\033[1mis\033[0m\n{next_tweet}')
import bayes as bayes
import nn as nn
import lsh as lsh
import projections as RP

if __name__ == '__main__':
    print('Welcome to the world of high and low dimensions!')
    # The entire code should be able to run from this file!

    import argparse

    # Command line: paths to the data and label files plus the dataset name.
    parser = argparse.ArgumentParser(
        description='Run the Bayes, nearest-neighbour and LSH classifiers on a dataset')
    parser.add_argument('--datafile', required=True, help='path to datafile')
    parser.add_argument('--labelfile', required=True, help='path to labelfile')
    parser.add_argument('--dataset', required=True,
                        help='dataset name: dolphins / pubmed / twitter')
    args = parser.parse_args()

    if args.dataset not in ('dolphins', 'pubmed', 'twitter'):
        print("The entered dataset name is incorrect. Please enter dataset name (dolphins / pubmed / twitter)")
    else:
        # Each call overwrites the previous metrics; only the LSH results
        # remain bound after this block.
        [Accuracy, F1macro, F1micro] = bayes.bayesClassifier(args.datafile, args.labelfile, args.dataset)
        [Accuracy, F1macro, F1micro] = nn.NearestNeighbor(args.datafile, args.labelfile, args.dataset)
        [Accuracy, F1macro, F1micro] = lsh.LSH(args.datafile, args.labelfile, args.dataset)
        #RP.RandomProjection(args.datafile,args.labelfile,args.dataset)