class Hasher:
    """Emit LSH band values for the rows of a keyed dataset.

    Each row is signed with ``MinHashSignature``; the signature is handed to
    ``LSH`` and every value it produces is emitted together with the row key
    and, optionally, the signature itself.

    Recognised keyword options (all optional):
        numHashes: forwarded to both ``MinHashSignature`` and ``LSH``
            (default 100).
        numItemsInBand: forwarded to ``LSH`` (default 10).
        saveMinhashes: when exactly ``False`` (the default) the signature is
            replaced by ``None`` in the emitted records.
    """

    def __init__(self, **options):
        """Build the signer and band hasher from ``options`` and log them."""
        num_hashes = options.get("numHashes", 100)
        num_items_in_band = options.get("numItemsInBand", 10)

        self.signer = MinHashSignature(num_hashes)
        self.hasher = LSH(num_hashes, num_items_in_band, None)
        self.save_min_hash = options.get("saveMinhashes", False)

        print('in hashing')
        # Report the values that were actually used. The original looked up
        # the misspelled key "numsItemsInBand" here, so the log always showed
        # the default of 10 regardless of what the caller passed.
        # %-formatting keeps the output identical on Python 2 and Python 3.
        print("%s %s %s" % (num_hashes, num_items_in_band, self.save_min_hash))

    def perform(self, rdd):
        """JSON-encode the value of every ``(key, value)`` pair, then hash.

        Returns whatever ``compute_hashes`` returns for the encoded dataset.
        """
        def _encode(pair):
            # Explicit unpacking replaces the Python 2-only ``lambda (x, y):``.
            key, value = pair
            return key, json.dumps(value)

        return self.compute_hashes(rdd.map(_encode))

    def compute_hashes(self, data):
        """Flat-map ``compute_row_lsh`` over the ``(key, row)`` pairs of ``data``."""
        def _hash_pair(pair):
            key, row = pair
            return self.compute_row_lsh(key, row)

        return data.flatMap(_hash_pair)

    def compute_row_lsh(self, key, row):
        """Yield ``(lsh_value, (key, signature_or_None))`` for one row.

        Nothing is yielded when ``row`` is empty or when the signer returns
        ``None`` for it.
        """
        if len(row) == 0:
            return

        signature = self.signer.sign(row)
        if signature is None:
            return

        # Materialise the band values before the signature may be dropped.
        band_values = list(self.hasher.hash(signature))

        # Only the literal False suppresses the signature (identity check
        # kept from the original on purpose).
        emitted_signature = None if self.save_min_hash is False else signature

        for band_value in band_values:
            yield band_value, (key, emitted_signature)
class Hasher:
    """Emit LSH band values for the rows of a keyed dataset.

    Each row is signed with ``MinHashSignature``; the signature is handed to
    ``LSH`` and every value it produces is emitted together with the row key
    and, optionally, the signature itself.
    """

    def __init__(self, num_hashes, num_items_in_band, save_min_hash):
        """Create the signer and band hasher.

        Args:
            num_hashes: forwarded to both ``MinHashSignature`` and ``LSH``.
            num_items_in_band: forwarded to ``LSH``.
            save_min_hash: when exactly ``False`` the signature is replaced
                by ``None`` in the emitted records.
        """
        self.signer = MinHashSignature(num_hashes)
        self.hasher = LSH(num_hashes, num_items_in_band, None)
        self.save_min_hash = save_min_hash

    def compute_hashes(self, data):
        """Flat-map ``compute_row_lsh`` over the ``(key, row)`` pairs of ``data``."""
        def _hash_pair(pair):
            # Explicit unpacking replaces ``lambda (x, y):``, which is a
            # syntax error on Python 3; this form is valid on 2 and 3.
            key, row = pair
            return self.compute_row_lsh(key, row)

        return data.flatMap(_hash_pair)

    def compute_row_lsh(self, key, row):
        """Yield ``(lsh_value, (key, signature_or_None))`` for one row.

        Nothing is yielded when ``row`` is empty or when the signer returns
        ``None`` for it.
        """
        if len(row) == 0:
            return

        signature = self.signer.sign(row)
        if signature is None:
            return

        # Materialise the band values before the signature may be dropped.
        band_values = list(self.hasher.hash(signature))

        # Only the literal False suppresses the signature (identity check
        # kept from the original on purpose).
        emitted_signature = None if self.save_min_hash is False else signature

        for band_value in band_values:
            yield band_value, (key, emitted_signature)
def __init__(self, num_hashes, num_items_in_band, save_min_hash):
    """Create the signer and band hasher used by this object.

    Args:
        num_hashes: forwarded to both ``MinHashSignature`` and ``LSH``.
        num_items_in_band: forwarded to ``LSH`` as its second argument.
        save_min_hash: stored unchanged on ``self.save_min_hash``.
    """
    self.signer = MinHashSignature(num_hashes)
    # The third LSH argument is always None here.
    band_hasher = LSH(num_hashes, num_items_in_band, None)
    self.hasher = band_hasher
    self.save_min_hash = save_min_hash
def __init__(self, **options):
    """Build the signer and band hasher from keyword options and log them.

    Recognised options (all optional):
        numHashes: forwarded to both ``MinHashSignature`` and ``LSH``
            (default 100).
        numItemsInBand: forwarded to ``LSH`` (default 10).
        saveMinhashes: stored on ``self.save_min_hash`` (default False).
    """
    num_hashes = options.get("numHashes", 100)
    num_items_in_band = options.get("numItemsInBand", 10)

    self.signer = MinHashSignature(num_hashes)
    self.hasher = LSH(num_hashes, num_items_in_band, None)
    self.save_min_hash = options.get("saveMinhashes", False)

    print('in hashing')
    # Report the values that were actually used. The original looked up the
    # misspelled key "numsItemsInBand" here, so the log always showed the
    # default of 10 regardless of what the caller passed.
    # %-formatting keeps the output identical on Python 2 and Python 3.
    print("%s %s %s" % (num_hashes, num_items_in_band, self.save_min_hash))
] images = [] for imgNames in imageNames: images.append(getImageData(imgNames)) alphaIgnoredImages = [] for img in images: imgWithoutAlpha = img[:,:,0:3] # print("imgWithoutAlpha", imgWithoutAlpha.shape) alphaIgnoredImages.append(imgWithoutAlpha) # print("images", images, images[0].shape, images[1].shape, images[2].shape) # print("alphaIgnoredImages", alphaIgnoredImages) reshapedImages = [] for img in alphaIgnoredImages: reshapedImg = img.reshape(1,-1) reshapedImages.append(reshapedImg) print("reshapedImages", reshapedImages, "dimension", reshapedImages[0].shape[1]) lshModel = LSH(noOfHashers=25, noOfHash=10, dimension=reshapedImages[0].shape[1]) for i in range(0, len(reshapedImages)): lshModel.train(reshapedImages[i], { "name": imageNames[i] }) print(lshModel.isSimilar(reshapedImages[0], reshapedImages[1])) print(lshModel.isSimilar(reshapedImages[0], reshapedImages[2])) print(lshModel.isSimilar(reshapedImages[1], reshapedImages[2])) print(lshModel.isSimilar(reshapedImages[2], reshapedImages[3]))
from sklearn.feature_extraction.text import CountVectorizer
from lsh.lsh import LSH
import numpy as np

# Sample sentences to compare.
texts = [
    'Jack went to the market to buy some fruits',
    'Jane went to the market to buy some fruits today',
    'Robert and his team played hockey today',
]

# Turn each sentence into a count vector, then give every sentence its own
# (1, vocabulary_size) slice.
vectorizer = CountVectorizer()
countMatrix = vectorizer.fit_transform(texts).toarray()
X = countMatrix.reshape(len(texts), 1, -1)

lshModel = LSH(noOfHashers=25, noOfHash=3, dimension=X.shape[2])

# Train on each sentence vector, tagging it with its source text.
for vector, sentence in zip(X, texts):
    lshModel.train(vector, {"name": sentence})

# Pairwise similarity checks, in the same order as before.
for left, right in ((0, 1), (0, 2), (1, 2)):
    print(lshModel.isSimilar(X[left], X[right]))