Example #1
	def test_noninteger_per_band(self):
		try:
			lsh.LSH(per_band="foo")
		except Exception:
			pass
		else:
			self.fail("per_band accepted a noninteger value.")
Example #2
	def test_too_few_bands(self):
		try:
			lsh.LSH(bands=-10)
		except Exception:
			pass
		else:
			self.fail("bands was set to -10; instantiation should have failed.")
Example #3
	def test_too_few_per_band(self):
		try:
			lsh.LSH(per_band=-10)
		except Exception:
			pass
		else:
			self.fail("per_band was set to -10; instantiation should have failed.")
Example #4
	def test_cache_is_usable(self):
		self.dump_data("small_correct")
		l = self.create_trained_model("small_correct", 21)
		l2 = lsh.LSH(assignment_name="small_correct")
		try:
			l2.load_cached_data()
		except Exception:
			self.fail("load_cached_data() raised an exception on freshly cached data.")
Example #5
 def __init__(self, L=1, k=1):
     self.fs = 44100.0                    # audio sample rate (Hz)
     self.pmax = 96                       # highest pitch bin of the CQT
     self.pmin = 36                       # lowest pitch bin of the CQT
     self.cqtsz = self.pmax - self.pmin   # number of CQT bins
     self.segsz = 20                      # frames per segment
     self.M = self.cqtsz * self.segsz     # flattened feature dimensionality
     self.fftsz = 2**15                   # FFT size (samples)
     self.hop = int(0.1 * self.fs)        # hop size: 100 ms in samples
     self.db = lsh.LSH(L, k, self.M)      # LSH index: L tables, k bits, M dims
Example #6
# print(test3)
# print(os.getcwd())
import sys

import numpy as np

import lsh

sys.path.append(test3)  # test3 is assumed to be defined earlier: the directory containing query.py
import query

test_result = query.get_query_vec(
    "This is a long string. It used to test the fast query and vectorisation parts. and chek"
    "if it works well")
print(test_result)
print(type(test_result))

file = "../data_vectors/matrix_model.npy"
parsed_files_dir = "../data_vectors/parsed_data"
# create instance of LSH class
lsh_test = lsh.LSH()

# table parameters
dataset = np.load(file)
number_of_tables = 50
euclidean = True

# create table
LSHtable = lsh_test.LSHtable(dataset, euclidean, number_of_tables)

# query parameters
# query = dataset[0]  # this can be obtained from Daniel
query = test_result
k = 10  # try k=100
# print("test===============================", dataset[0])
# run query and return query run time
Example #7
import os
import sys
import time

import numpy as np
import scipy.sparse
from scipy.sparse import dok_matrix
from PIL import Image
import imagehash

import lsh

# exp_config (a ConfigParser) and similarity() are assumed to be defined
# elsewhere in the original module.


def image2sim(input_images, prefix='image'):

    PATH = exp_config.get('data', 'path')
    IDENTICAL_T = float(exp_config.get('predicate_image',
                                       'identical_threshold'))
    method = exp_config.get('predicate_image', 'method')
    embedding_iters = int(exp_config.get('cosine_embedding', 'n_iter'))

    assert method in [
        'identical', 'vgg16', 'vgg19', 'xception', 'inception_resnet_v2',
        'vggface'
    ]
    print('input_images', len(input_images))

    if os.path.isfile(PATH + prefix + '_list' + '.txt'):
        images = list()
        with open(PATH + prefix + '_list' + '.txt', 'r') as fin:
            for line in fin:
                images.append(line.rstrip('\n'))
    else:
        images = [image for image in input_images if image is not None]
        with open(PATH + prefix + '_list' + '.txt', 'w') as fout:
            for image in images:
                fout.write(image)
                fout.write('\n')

    if method == 'identical':
        if os.path.isfile(PATH + prefix + '_sim_' + method + '.npz'):
            sim = scipy.sparse.load_npz(PATH + prefix + '_sim_' + method +
                                        '.npz')
        else:
            funs = [
                imagehash.average_hash, imagehash.phash, imagehash.dhash,
                imagehash.whash
            ]

            im_objs = list()
            for image in images:
                im_objs.append(Image.open(PATH + image))

            print('images', len(images), 'im_objs', len(im_objs))

            vs = list()
            for i in range(len(im_objs)):
                obj_i = im_objs[i]
                v_i = np.array([fun(obj_i) for fun in funs])
                vs.append(v_i)

            sim = dok_matrix((len(images), len(images)), dtype=np.float32)
            for i in range(len(images)):

                current_t = time.time()

                v_i = vs[i]
                for j in range(len(images)):
                    v_j = vs[j]
                    s = np.median(v_i - v_j)
                    if s < IDENTICAL_T:
                        sim[i, j] = (IDENTICAL_T - s) / IDENTICAL_T

                print('processing images ', i, 100 * i // len(images),
                      time.time() - current_t, 's')

            sim = sim.asformat('csr')
            scipy.sparse.save_npz(PATH + prefix + '_sim_' + method + '.npz',
                                  sim)

    if method in [
            'vgg16', 'vgg19', 'xception', 'inception_resnet_v2', 'vggface'
    ]:
        if method == 'vgg16':
            from keras.applications.vgg16 import VGG16
            from keras.preprocessing import image as keras_image
            from keras.applications.vgg16 import preprocess_input
            model = VGG16(weights='imagenet', include_top=False)
        elif method == 'vgg19':
            from keras.applications.vgg19 import VGG19
            from keras.preprocessing import image as keras_image
            from keras.applications.vgg19 import preprocess_input
            model = VGG19(weights='imagenet', include_top=False)
        elif method == 'xception':
            from keras.applications.xception import Xception
            from keras.preprocessing import image as keras_image
            from keras.applications.xception import preprocess_input
            model = Xception(weights='imagenet', include_top=False)
        elif method == 'inception_resnet_v2':
            from keras.applications.inception_resnet_v2 import InceptionResNetV2
            from keras.preprocessing import image as keras_image
            from keras.applications.inception_resnet_v2 import preprocess_input
            model = InceptionResNetV2(weights='imagenet', include_top=False)
        elif method == 'vggface':
            print('vggface')
            from keras_vggface.vggface import VGGFace
            from keras.preprocessing import image as keras_image
            from keras_vggface.utils import preprocess_input
            model = VGGFace(include_top=False)

        def get_feature(img_path):
            img = keras_image.load_img(img_path, target_size=(224, 224))
            x = keras_image.img_to_array(img)
            x = np.expand_dims(x, axis=0)
            x = preprocess_input(x)
            feature = model.predict(x)
            return feature

        if os.path.isfile(PATH + prefix + '_embeddings_' + method + '.npy'):
            embeddings = np.load(PATH + prefix + '_embeddings_' + method +
                                 '.npy')
        else:
            print('get image features')  #debug
            embeddings = list()
            for image in images:
                embeddings.append(get_feature(PATH + image).flatten())
                print('process', image)
            embeddings = np.array(embeddings, dtype=np.float32)

            np.save(PATH + prefix + '_embeddings_' + method + '.npy',
                    embeddings)

        if os.path.isfile(PATH + prefix + '_sim_' + method + '.npz'):
            sim = scipy.sparse.load_npz(PATH + prefix + '_sim_' + method +
                                        '.npz')
        else:
            lsh_instance = lsh.LSH(8, 5)
            indices = lsh_instance.load(embeddings)
            sim = dok_matrix((len(images), len(images)), dtype=np.float32)
            for i in range(len(images)):

                v_i = embeddings[i]
                for j in lsh_instance.query(indices[i]):
                    v_j = embeddings[j]
                    sim[i, j] = similarity(v_i, v_j)

                sys.stdout.write("\r%d%%" % (100 * i // len(images)))
                sys.stdout.flush()

            sim = sim.asformat('csr')
            scipy.sparse.save_npz(PATH + prefix + '_sim_' + method + '.npz',
                                  sim)

    image2eid = dict(zip(images, range(len(images))))
    image2eid[None] = len(images)

    return image2eid, sim
Example #8
	def test_data_is_not_there(self):
		l = lsh.LSH(assignment_name="data_not_there")
		self.assertFalse(l.is_cached(),
			"LSH tells us there's data here, but there shouldn't be.")
Example #9
	def create_trained_model(self, assignment_name, dims):
		l = lsh.LSH(assignment_name=assignment_name)
		data = self.retrieve_test_data(assignment_name)
		l.bin_data(data, dims=dims)
		return l
Example #10
	def test_vanilla_instantiation(self):
		try:
			lsh.LSH()
		except Exception:
			self.fail("default LSH() instantiation raised an exception.")
Example #11
	def test_cache_is_not_usable(self):
		# Need to find a better Exception to raise.
		l = lsh.LSH(assignment_name="broken_cache")
		self.assertRaises(KeyError, l.load_cached_data)
Example #12
 def setUp(self):
     self.lsh = lsh.LSH(n_features=10)
Example #13
        'My lifestyle is funky, and I want to be heard. Here is to thirsty encounters, or something. #powerpile #randomtweet',
        'My diet is a personal wish, and I want to fly away. For all detailed glamour, forever. #thiswad #randomtweet',
        'My favorite team is paid for, and I want to slow down. More sick lessons, so say we all. #megababe #randomtweet'
    ]
    # create a lot of duplicates
    tweets_with_duplicates = []
    for _ in range(500):
        tweets_with_duplicates.extend(tweets)

    ## get vectors from tweets using sklearn
    vecs = TfidfVectorizer(stop_words='english').fit_transform(tweets_with_duplicates).toarray()

    ## create lsh
    num_tables = 20
    hash_size = 13
    bucket_size = 100000
    input_dimension = vecs.shape[1]
    sann = lsh.LSH(num_tables, hash_size, input_dimension, bucket_size)

    ## simulate streaming nearest neighbor search
    for tweet_id, vec in enumerate(vecs):
        sann[vec] = tweet_id  # append tweet to the index
        ids = sann[vec]  # get the ids of the nearest vectors

        # get the nearest tweet
        next_id = ids[0]
        next_tweet = tweets_with_duplicates[next_id]

        # pretty print current tweet and its nearest neighbor
        print(f'\n\033[1mNearest neighbor of\033[0m\n{tweets_with_duplicates[tweet_id]}\n\033[1mis\033[0m\n{next_tweet}')

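The sann[vec] = tweet_id assignment and sann[vec] lookup above imply that this LSH class implements __setitem__ and __getitem__. A minimal sketch of such a dict-style interface, assuming random-hyperplane (signed random projection) hashing and omitting the bucket_size bound; this is an illustration, not the library's actual implementation:

import numpy as np

class DictStyleLSH:
    """Hypothetical dict-style LSH index (not the real lsh.LSH)."""

    def __init__(self, num_tables, hash_size, input_dimension, seed=0):
        rng = np.random.default_rng(seed)
        # One random-hyperplane matrix per table.
        self.planes = [rng.standard_normal((hash_size, input_dimension))
                       for _ in range(num_tables)]
        self.tables = [{} for _ in range(num_tables)]

    def _keys(self, vec):
        # The sign pattern of the projections yields one bucket key per table.
        return [tuple(bool(b) for b in (p @ vec) > 0) for p in self.planes]

    def __setitem__(self, vec, item_id):
        for table, key in zip(self.tables, self._keys(vec)):
            table.setdefault(key, []).append(item_id)

    def __getitem__(self, vec):
        candidates = []
        for table, key in zip(self.tables, self._keys(vec)):
            candidates.extend(table.get(key, []))
        return candidates

Querying returns every id that collided with the query in at least one table, which is why a duplicated tweet reliably surfaces as its own nearest candidate in the loop above.
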
Example #14
import bayes
import nn
import lsh
import projections as RP

if __name__ == '__main__':
    print('Welcome to the world of high and low dimensions!')
    # The entire code should be able to run from this file!
    import argparse
    
    parser = argparse.ArgumentParser(description='Run the Bayes, nearest-neighbor, and LSH classifiers on a dataset')
    parser.add_argument('--datafile', required=True,
                        help='path to datafile')
    parser.add_argument('--labelfile', required=True,
                        help='path to labelfile')
    parser.add_argument('--dataset', required=True,
                        help='dataset name: dolphins / pubmed / twitter')
    args = parser.parse_args()
    if args.dataset not in ('dolphins', 'pubmed', 'twitter'):
        print("The entered dataset name is incorrect. Please enter a valid dataset name (dolphins / pubmed / twitter).")
    else:
        [Accuracy, F1macro, F1micro] = bayes.bayesClassifier(args.datafile,args.labelfile,args.dataset)
        [Accuracy, F1macro, F1micro] = nn.NearestNeighbor(args.datafile,args.labelfile,args.dataset)
        [Accuracy, F1macro, F1micro] = lsh.LSH(args.datafile,args.labelfile,args.dataset)
        #RP.RandomProjection(args.datafile,args.labelfile,args.dataset)