from datasketch import MinHash


def estimateDistinctElements(items, num_perm):
    """Estimate the number of distinct elements in a list.

    The default number of hash permutations is num_perm (128), but I adjusted
    it after researching more - http://blog.cluster-text.com/tag/minhash/
    """
    h = MinHash(num_perm)  # create a MinHash with num_perm hash permutations
    for item in items:
        h.update(item.encode('utf8'))  # fold each item into the MinHash signature
    print("Estimated number of elements: ", h.count())
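# A minimal usage sketch for estimateDistinctElements above (the item list is
# made up for illustration): with 128 permutations, count() should land
# reasonably close to the true value of 4 distinct strings.
sample_items = ["apple", "banana", "apple", "cherry", "banana", "durian"]
estimateDistinctElements(sample_items, 128)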
def query(self, v, n):
    m = MinHash(num_perm=1)
    for e in v:
        m.update(str(e).encode('utf-8'))
    # run the annoy lookup once, print it for debugging, then return it
    neighbours = self._annoy.get_nns_by_vector(m.digest().tolist(), n,
                                               self._search_k)
    print(neighbours)
    return neighbours
def estimateDistinctElementParallel(listOfItems, num_perm):
    """Same as above, except here we have a nested for loop to iterate through
    the lists in the list. This function also appends the estimation result to
    a list (the module-level `estimate`) for use in the following accuracy
    function."""
    h = MinHash(num_perm)
    for item in listOfItems:
        for i in item:  # nested for loop to iterate over lists within a list
            h.update(i.encode('utf8'))
    estimate.append(h.count())
    print("Estimated number of elements: ", h.count())
class VisualMinHashWithDataSketch:
    """
    minHash with sketches for near image duplicate detection.
    This is an implementation of the minHash algorithm introduced in
    Scalable Near Identical Image and Shot Detection - Microsoft
    (https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/civr2007.pdf)
    by Ondrej Chum, James Philbin, Michael Isard, Andrew Zisserman
    """

    # TODO: add word weighting on this minHash algorithm.
    def __init__(self,
                 minHash_hash_num=512,
                 minHash_param_k=512,
                 minHash_param_s=3,
                 rand_seed=0):
        # We could use a minHash function as a permutation of the vocabulary.
        # However, that is memory inefficient. As an alternative, we can use a
        # hash function and take the min value among the existing members.
        # TODO: This alternative may not work. Check this out.
        from datasketch import MinHash

        # In the paper, sec 4.1, they use 512 independent hash functions and
        # group them into 512 sketches by reusing the hash functions multiple
        # times. I think this is not a valid implementation, because the
        # sketches are no longer independent. Maybe that was a compromise
        # between mathematical accuracy and speed: calculating 512*3 hash
        # functions is 3 times slower. To reproduce the paper results, I may
        # have to follow this implementation. But let me try the correct
        # implementation first, which makes the 512 sketches truly independent.
        self.minHash_hash_num = minHash_hash_num  # independent hash functions
        self.minHash_param_k = minHash_param_k    # number of sketches
        self.minHash_param_s = minHash_param_s    # tuple length, or sketch size

        np.random.seed(rand_seed)
        self.sketch_choices = []
        for k in range(minHash_param_k):
            rand_choice_hashfunc = []
            for s in range(minHash_param_s):
                rand_choice_hashfunc.append(
                    np.random.randint(0, minHash_hash_num))
            # print('choice:', rand_choice_hashfunc)
            self.sketch_choices.append(rand_choice_hashfunc)

        self.minHash = MinHash(num_perm=minHash_hash_num, seed=rand_seed)

    def hash_bow(self, target_set):
        # init minHashes
        self.minHash.clear()
        for elem in target_set:
            self.minHash.update_with_intval(elem)
        hashval = self.minHash.digest()
        # print('hashval:', hashval)

        result = []
        for choice_indexes in self.sketch_choices:
            # print('choice_indexes:', choice_indexes)
            sketch = hashval[choice_indexes]
            # print('sketch:', sketch)
            result.append(tuple(sketch))
        return result
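# hash_bow above assumes a MinHash variant that provides update_with_intval,
# which stock datasketch does not ship. The self-contained sketch below
# illustrates the same idea -- grouping min-hash values into s-tuples and
# counting colliding sketches -- using only the stock MinHash.update API over
# string-encoded visual-word ids (all sizes and inputs are illustrative).
import numpy as np
from datasketch import MinHash

NUM_HASH, K, S = 512, 512, 3
rng = np.random.RandomState(0)
choices = rng.randint(0, NUM_HASH, size=(K, S))  # which hash values form each sketch


def image_sketches(visual_words):
    m = MinHash(num_perm=NUM_HASH)
    for w in visual_words:
        m.update(str(w).encode('utf8'))
    hv = m.digest()
    return {tuple(hv[i] for i in row) for row in choices}


a = image_sketches(range(0, 300))   # two overlapping bags of visual words
b = image_sketches(range(100, 400))
# Each sketch collides with probability roughly J**S, so the number of shared
# sketches grows with the Jaccard similarity of the two visual-word sets.
print(len(a & b))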
def FIG(self, tr):
    # wm = self.wmg.minhash(tr)  # wm1 is of the type WeightedMinHash
    # vl = np.transpose(wm.hashvalues)
    # vl = vl[0]
    m = MinHash(num_perm=self.num_perm)
    for d in tr:
        m.update(d.encode('utf8'))
    return m.digest()
def query(self, v, n):
    if self._metric == 'angular':
        v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0]
    m = MinHash(num_perm=128)
    for e in v:
        m.update(str(e).encode('utf-8'))
    return self._lshf.kneighbors([m.digest()],
                                 return_distance=False,
                                 n_neighbors=n)[0]
def find_minhash(self, num_perm=128):
    """ Compute minhash, cached. """
    words = self.words
    doc_hash = MinHash(num_perm=num_perm)
    for word, _ in words:
        doc_hash.update(word.encode('utf8'))
    return list(doc_hash.digest())
def fit(self, X):
    self.index = numpy.empty([0, 32])
    self._index_minhash = []
    self._ball_index = []
    self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
    for i, x in enumerate(X):
        m = MinHash(num_perm=self._n_perm)
        for e in x:
            m.update(str(e).encode('utf-8'))
        self._index.add(str(i), m)
        # self.index.append(m.digest())
        self.index = numpy.vstack((self.index, m.digest()))
        self._ball_index.append(m.digest())
        self._index_minhash.append(m)
    self._index.index()
    self._X = X
    self.tree = BallTree(self.index, leaf_size=self._n_leaves)
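# The fit above builds both a datasketch MinHashLSHForest and a BallTree over
# the raw digests. A minimal sketch of the forest part alone, on a toy
# dataset (names and sizes are illustrative), looks like this:
from datasketch import MinHash, MinHashLSHForest

toy_data = [["cat", "dog", "fish"], ["cat", "dog", "bird"], ["car", "bus", "train"]]

forest = MinHashLSHForest(num_perm=128)
toy_minhashes = []
for i, x in enumerate(toy_data):
    m = MinHash(num_perm=128)
    for e in x:
        m.update(str(e).encode('utf-8'))
    forest.add(str(i), m)
    toy_minhashes.append(m)
forest.index()  # the forest must be indexed before it can be queried

print(forest.query(toy_minhashes[0], 2))  # keys of the top-2 approximate neighbours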
def minhash_implem(url_shingles_list):
    list_url_hash = []
    for url in range(len(url_shingles_list)):
        m = MinHash(num_perm=8)
        shingle_list = url_shingles_list[url][1]
        for shingle in shingle_list:
            m.update(shingle.encode('utf8'))
        list_url_hash.append(
            ["{0}".format(url_shingles_list[url][0]), m.digest()])
    return list_url_hash
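# The digests returned by minhash_implem can be compared directly: the
# fraction of positions where two signatures agree is an estimate of the
# Jaccard similarity of the shingle sets (coarse here, since num_perm=8).
# The input below is a hypothetical (url, shingle-list) structure.
import numpy as np

url_hashes = minhash_implem([("http://a.example", ["ab", "bc", "cd"]),
                             ("http://b.example", ["bc", "cd", "de"])])
d1, d2 = url_hashes[0][1], url_hashes[1][1]
print(np.mean(d1 == d2))  # estimated Jaccard similarity between the two URLs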
def fit(self, X):
    self.index = numpy.empty([0, 128])
    for i, x in enumerate(X):
        m = MinHash(num_perm=128)
        for e in x:
            m.update(str(e).encode('utf-8'))
        self.index = numpy.vstack((self.index, m.digest()))
        self._index_minhash.append(m)
    self._lshf = sklearn.neighbors.LSHForest(
        n_estimators=self._n_estimators, n_candidates=self._n_candidates)
    if self._metric == 'angular':
        X = sklearn.preprocessing.normalize(X, axis=1, norm='l2')
    self._lshf.fit(self.index)
def minHash_bml(SX, SY):
    print()
    print("MinHash BML")
    l = 32
    m = 8
    num_perm = pow(2, m)
    error = pow(10, -5)
    print("Number of permutations is ", num_perm)
    m1 = MinHash(num_perm)
    m2 = MinHash(num_perm)
    for d in SX:
        m1.update(d.encode('utf8'))
    for d in SY:
        m2.update(d.encode('utf8'))
    nx = m1.count()
    ny = m2.count()
    print("Estimated nx is ", nx)
    print("Estimated ny is ", ny)
    Vx = m1.digest()
    Vy = m2.digest()
    z = 0
    for i in range(0, num_perm):
        if Vx[i] >= Vy[i]:
            z = z + 1
    P = z / num_perm
    print("P is: ", P)
    print("Inclusion Coefficient: ",
          lookup(P, 0, min(nx, ny), nx, ny, error, m, num_perm, l, 0, 0))
    return
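# An alternative, lookup-free estimate of the inclusion coefficient from the
# same MinHash sketches (a hedged sketch; it is not the BML lookup used above):
# with J = |X ∩ Y| / |X ∪ Y| and estimated cardinalities nx, ny,
#   |X ∪ Y| ≈ (nx + ny) / (1 + J)  and  |X ∩ Y| ≈ J * |X ∪ Y|,
# so the containment of the smaller set is |X ∩ Y| / min(nx, ny).
from datasketch import MinHash

SX = {"x%d" % i for i in range(1000)}
SY = {"x%d" % i for i in range(500)} | {"y%d" % i for i in range(200)}

m1, m2 = MinHash(num_perm=256), MinHash(num_perm=256)
for d in SX:
    m1.update(d.encode('utf8'))
for d in SY:
    m2.update(d.encode('utf8'))

j = m1.jaccard(m2)
nx, ny = m1.count(), m2.count()
union = (nx + ny) / (1.0 + j)
inter = j * union
print("Inclusion coefficient (approx):", inter / min(nx, ny))  # true value is 500/700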
def fit(self, X):
    self.index = numpy.empty([0, 1])
    self._index_minhash = []
    for i, x in enumerate(X):
        m = MinHash(num_perm=1)
        for e in x:
            m.update(str(e).encode('utf-8'))
        self.index = numpy.vstack((self.index, m.digest()))
        self._index_minhash.append(m)
    self._annoy = annoy.AnnoyIndex(self.index.shape[1])
    for i, x in enumerate(self.index):
        self._annoy.add_item(i, x.tolist())
    self._annoy.build(self._n_trees)
def query(self, v, n): print("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") m = MinHash(num_perm=self._n_perm) for e in v: m.update(str(e).encode('utf-8')) # for i in self._annoy.get_nns_by_vector(v.tolist(), n, 100): # print(self._index_minhash[int(i)].jaccard(m)) dist, ind = self.tree.query([m.digest()], k=n) for i in ind[0]: # print(i) print(self._index_minhash[int(i)].jaccard(m)) print("=======================") brute_indices = self.query_with_distances(m.digest(), n) for i in brute_indices: print(self._index_minhash[int(i)].jaccard(m)) print("-----------------------") ind2 = self._index.query(m, n) for i in ind2: print(self._index_minhash[int(i)].jaccard(m)) # return map(int, ind[0]) return self.query_with_distances(m.digest(), n)
def extract_attribute(self, base_object: BDFunction) -> int: # Check if value already exists FunctionMinHashLSH_value = base_object.get_attribute_value('FunctionMinHashLSH') if FunctionMinHashLSH_value: pass else: normalized_instr_set: set = set(base_object.get_attribute_value('FunctionNormalized')) # Create MinHash object minhash = MinHash(num_perm=Configuration.MINHASH_PERMUTATIONS, seed=Configuration.MINHASH_SEED) for instr in normalized_instr_set: minhash.update(instr.encode('utf8')) base_object.add_attribute_value('FunctionMinHashLSH', {'function_lsh': minhash.digest()}) FunctionMinHashLSH_value = base_object.get_attribute_value('FunctionMinHashLSH') return FunctionMinHashLSH_value['function_lsh'] if FunctionMinHashLSH_value else None
class Text(Base):
    """ client class to control api with restful """

    def __init__(self, api_key=None, host_url=None):
        """Initial sixecho

        Attributes:
            api_key(string)  - Optional : api_key generated from sixecho
            host_url(string) - Optional : the sixecho domain
        """
        self.api_key = api_key
        if host_url is not None:
            if host_url.endswith("/"):
                host_url = host_url[:-1]
        self.host_url = host_url
        self.array_words = []
        self.min_hash = MinHash(num_perm=128)
        self.max_workers = 1
        self.sha256 = ""
        self.file_size = 0
        self.meta_media = None
        self.type = "TEXT"
        self.digest = ""
        self.common_info = {}
        self.ref_info = {}
        self.detail_info = {}

    # def digest(self):
    #     """Export the hash values, which is the internal state of the
    #     MinHash.
    #     Returns:
    #         numpy.array: The hash values which is a Numpy array.
    #     """
    #     return self.min_hash.digest()

    def set_detail_info(self, detail_info):
        """
        detail_info: Required
            "isbn"            Optional - string
            "author"          Optional - string
            "publisher"       Optional - string
            "published_date"  Optional - integer (unix timestamp)
            "language"        Optional - string
            "number_of_pages" Optional - integer
        """
        self.detail_info = detail_info

    def generate(self, str=None, txtpath=None, epubpath=None, pdfpath=None):
        """Generate minhash with new value from string or file

        we use minhash from https://ekzhu.github.io/datasketch/_modules/datasketch/minhash.html#MinHash.update

        Args:
            str(string)      - Optional : string whose minhash is to be computed.
            txtpath(string)  - Optional : path of the text file to be computed.
            epubpath(string) - Optional : path of the epub file to be computed.
            pdfpath(string)  - Optional : path of the pdf file to be computed.
        """
        if txtpath:
            self.load_file(txtpath)
        elif epubpath:
            size = len(epubpath.split('.'))
            name = epubpath.split('.')[size - 2]
            size = len(name.split('/'))
            name = name.split('/')[size - 1]
            name = name.replace("/", "")
            name = name + '.txt'
            cur_path = os.path.dirname(os.path.abspath(__file__))
            self.write2text(self.readepub(epubpath), name)
            self.load_file(cur_path + '/' + name)
            os.remove(cur_path + '/' + name)
        elif pdfpath:
            size = len(pdfpath.split('.'))
            name = pdfpath.split('.')[size - 2]
            size = len(name.split('/'))
            name = name.split('/')[size - 1]
            name = name.replace("/", "")
            name = name + '.txt'
            cur_path = os.path.dirname(os.path.abspath(__file__))
            self.write2text(self.readpdf(pdfpath), name)
            self.load_file(cur_path + '/' + name)
            os.remove(cur_path + '/' + name)
        else:
            sha256 = hashlib.sha256()
            sha256.update(str.encode())
            self.sha256 = sha256.hexdigest()
            self.array_words = tokenize(str)
            self.file_size = len(str)
            for d in self.array_words:
                self.min_hash.update(d.encode('utf8'))
        self.make_digest()

    def set_meta(self, meta_books):
        """
        Args:
            meta_books(Hash) - Required : book metadata, including
                - category_id(string)  - Required : category id from the search category api
                - publisher_id(string) - Required : publisher id from the search publisher api
                - title(string)        - Required : book title
                - auther(string)       - Required : book author
                - country_of_origin(string)       : country, ISO 3166-1
                - language(string)     - Required : language, ISO 639-1
                - paperback(string)    - Required : total number of pages
                - publish_date(string) - Required : publish date
        """
        self.meta_media = meta_books

    def create_sha256_signature(self, secret, message):
        secret = str(secret)
        message = str(message)
        # print(secret, message)
        # print(type(secret))
        # print(type(message))
        secret_byte = str(secret).encode('utf-8')
        message_byte = str(message).encode('utf-8')
        signature = hmac.new(secret_byte, message_byte,
                             hashlib.sha256).hexdigest()
        return signature

    def make_digest(self):
self.digest = ",".join([str(num) for num in self.min_hash.digest()]) def load_file(self, fpath): """ method load_file """ sha256 = hashlib.sha256() f_count = open(fpath, "r") f = f_count.readlines() f_count.close() list_of_groups = None if self.max_workers != 1: l = f n = self.max_workers list_of_groups = [l[i:i + n] for i in range(0, len(l), n)] # list_of_groups = zip(*(iter(f), ) * self.max_workers) file_size = os.path.getsize(fpath) # print_progress_bar(0, # file_size, # prefix='Progress:', # suffix='Complete', # length=50) progress = 0 lines = [] if self.max_workers == 1: for line in f: progress = progress + len(line) sha256.update(line.encode()) words = tokenize(line) if len(words) != 0: for d in words: self.min_hash.update(d.encode('utf8')) # print_progress_bar(progress, # file_size, # prefix='Progress:', # suffix='Complete', # length=50) else: for line in f: sha256.update(line.encode()) for lines in list_of_groups: for line in lines: progress = progress + len(line) words = tokenize_mutiline(lines) if len(words) != 0: for d in words: self.min_hash.update(d.encode('utf8')) # print_progress_bar(progress, # file_size, # prefix='Progress:', # suffix='Complete', # length=50) self.sha256 = sha256.hexdigest() self.file_size = file_size def readepub(self, fpath): list_text = [] book = open_book(fpath) lines = ec.utils.convert_epub_to_lines(book) for line in lines: text = ec.utils.convert_lines_to_text(str(line), "txt") text = list(text) for ele in text: list_text.append(ele) return list_text def readpdf(self, fpath): pdfFileObj = open(fpath, 'rb') # 'rb' for read binary mode pdfReader = PyPDF2.PdfFileReader(pdfFileObj) total_page = pdfReader.numPages # print(total_page) list_text = [] for i in range(total_page): pageObj = pdfReader.getPage(i) list_text.append(pageObj.extractText()) return list_text def write2text(self, list_text, opname): cur_path = os.path.dirname(os.path.abspath(__file__)) fpath = opname print(cur_path) print(cur_path + '/' + fpath) file = open(cur_path + '/' + fpath, 'w') for ele in list_text: file.write(ele) file.close()
# make corpus a dictionary. Needed to calculate the true jaccard score.
# mycorpus={i+1:set(line.lower().split()) for i,line in enumerate(open(fname,'r')) if i+1 in linestoget}
print("--- %s seconds ---" % (time.time() - start_time))

print('Calculate minhash signatures')
start_time = time.time()
# prepare dictionary of hashes
hashcorp = dict.fromkeys(linestoget)
# compute hashes
for key, doc in mycorpus:  # .iteritems():
    # compute minhash signature
    m = MinHash(num_perm=num_permutations)
    # for token in doc: m.digest(sha1(token.encode('utf8')))
    for token in doc:
        # datasketch >= 1.0: update() replaces the old digest(sha1(...)) API
        m.update(token.encode('utf8'))
    hashcorp[key] = m
print("--- %s seconds ---" % (time.time() - start_time))

if calc_clusters:
    p = Pool(num_processes)
    assignment = [(x,) for x in thresholds]
    print(assignment)
    p.map(compute_clusters, assignment)

if calc_match:
    # create a balanced, pairwise test set
    # first create cluster to ad dictionary
class ColumnSketch:
    """A Column Sketch contains a summary of a table column.

    Args:
        column_name: the extracted column name.
        minhash_size: the number of permutations to use for MinHash.
        minhash_seed: the random seed used by MinHash.
        hyperloglog_p: the precision parameter used by HyperLogLog.
        sample_size: the size of sample to be kept.
        enable_word_vector_data: whether to build word embedding vector
            for data values -- can be 10x more expensive.
    """

    def __init__(
        self,
        column_name,
        minhash_size=256,
        minhash_seed=43,
        hyperloglog_p=8,
        sample_size=100,
        enable_word_vector_data=False,
        model=WordVectorModel,
    ):
        self._column_name = column_name
        self._sample = set([])
        self._sample_size = sample_size
        self._count = 0
        self._empty_count = 0
        self._oov_count = 0
        self._numeric_count = 0
        self._minhash = MinHash(num_perm=minhash_size,
                                seed=minhash_seed,
                                hashfunc=self._hashfunc32)
        self._hhl = HyperLogLogPlusPlus(p=hyperloglog_p,
                                        hashfunc=self._hashfunc64)
        self._enabled_word_vec_data = enable_word_vector_data
        self._model = model
        self._sum_vector = self._model.get_empty_word_vector()

    def _hashfunc32(self, str_value):
        return farmhash.hash32(str_value)

    def _hashfunc64(self, str_value):
        return farmhash.hash64(str_value)

    @property
    def column_name(self):
        """The extracted column name."""
        return self._column_name

    @property
    def sample(self):
        """A sample (non-random) of the data values in the column as a list."""
        return list(self._sample)

    @property
    def count(self):
        """The total number of data values (i.e. rows) including the empty ones."""
        return self._count

    @property
    def empty_count(self):
        """The number of empty data values."""
        return self._empty_count

    @property
    def non_empty_count(self):
        """The number of non-empty data values."""
        return self._count - self._empty_count

    @property
    def out_of_vocabulary_count(self):
        """The number of data values that are non-empty and outside of
        the language model's vocabulary."""
        return self._oov_count

    @property
    def in_vocabulary_count(self):
        """The number of data values that are non-empty and in the
        language model's vocabulary."""
        return self._count - self._empty_count - self._oov_count

    @property
    def numeric_count(self):
        """The number of data values that are non-empty and numerical."""
        return self._numeric_count

    @property
    def is_numeric(self):
        """Whether the column is numeric, based on whether at least 50% of rows are numeric."""
        if self.non_empty_count == 0:
            return False
        return (float(self._numeric_count) / float(self.non_empty_count)) >= 0.5

    @property
    def distinct_count(self):
        """The approximate distinct count made by the HyperLogLog."""
        if len(self._sample) < self._sample_size:
            return len(self._sample)
        return max(len(self._sample), self._hhl.count())

    @property
    def word_vector_column_name(self):
        """The word embedding vector of the column name as a list."""
        doc = self._model.process(self.column_name)
        vectors = [token.vector for token in doc if token.has_vector]
        if len(vectors) == 0:
            return None
        return list(float(v) for v in np.sum(vectors, axis=0))

    @property
    def word_vector_data(self):
        """The mean word embedding vector of all data values as a list."""
        if not self._enabled_word_vec_data:
            return None
        if self.in_vocabulary_count == 0:
            return None
        vector = self._sum_vector / np.float32(self.in_vocabulary_count)
        return list(float(v) for v in vector)

    @property
    def minhash(self):
        """The hash values in the MinHash."""
        return list(int(v) for v in self._minhash.digest())

    @property
    def seed(self):
        """The random seed used for MinHash.
""" return self._minhash.seed @property def hyperloglog(self): """The register values of the HyperLogLog counter. """ return list(int(v) for v in self._hhl.digest()) def update(self, value): """Add a data value into the sketch. """ # Update counter. self._count += 1 if not isinstance(value, str): value = json.dumps(value, sort_keys=True) # Clean the value value = value.strip().lower() # Skip if the value is empty string. if len(value) == 0: self._empty_count += 1 return if _is_number(value): self._numeric_count += 1 # Add to sample. if len(self._sample) < self._sample_size: self._sample.add(value) # Update the MinHash sketch. self._minhash.update(value) # Update the HyperLogLog sketch. self._hhl.update(value) # Skip word vector extraction if not enabled. if not self._enabled_word_vec_data: return # Update the sum of word embeddings. vectors = [ token.vector for token in self._model.process(value) if token.has_vector ] if len(vectors) > 0: self._sum_vector += np.sum(vectors, axis=0) else: self._oov_count += 1
if args.header:
    next(f)  # TODO test robustness
# mycorpus=[(i,set(line.encode('utf8', 'ignore').lower().split())) for i,line in enumerate(f)]
mycorpus = [(i, set(line.lower().split())) for i, line in enumerate(f)]
print("--- %s seconds ---" % (time.time() - start_time))

print('Calculate minhash signatures')
start_time = time.time()
# prepare dictionary of hashes
hashcorp = dict.fromkeys([tup[0] for tup in mycorpus])
# compute hashes
for key, doc in mycorpus:
    # compute minhash signature
    m = MinHash(num_perm=num_permutations)
    for token in doc:
        # datasketch >= 1.0: update() replaces the old digest(sha1(...)) API
        m.update(token.encode('utf8'))
    hashcorp[key] = m
print("--- %s seconds ---" % (time.time() - start_time))

if num_processes > 1:
    if len(thresholds) < num_processes:
        num_processes = len(thresholds)
    p = Pool(num_processes)
    assignment = [(x,) for x in thresholds]
    p.map(compute_clusters, assignment)
else:
    for x in thresholds:
        compute_clusters((x,))
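# compute_clusters is defined elsewhere; a minimal, self-contained sketch of
# the same near-duplicate grouping idea with datasketch's MinHashLSH at a
# single threshold (the example lines are invented):
from datasketch import MinHash, MinHashLSH


def line_minhash(line, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for token in set(line.lower().split()):
        m.update(token.encode('utf8'))
    return m


example_lines = ["the quick brown fox", "the quick brown foxes", "something else entirely"]
example_hashes = {i: line_minhash(line) for i, line in enumerate(example_lines)}

lsh = MinHashLSH(threshold=0.5, num_perm=128)
for i, m in example_hashes.items():
    lsh.insert(i, m)

for i, m in example_hashes.items():
    print(i, lsh.query(m))  # ids whose estimated Jaccard with line i is >= 0.5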
def get_min_hash(text, too_common, num_perm=128):
    min_hash = MinHash(num_perm=num_perm)
    for shingle_h in shingle_hashes(text):
        if shingle_h.hexdigest() not in too_common:
            # feed the shingle's digest bytes to the MinHash
            # (datasketch >= 1.0: update() replaces the old digest(hashobj) API)
            min_hash.update(shingle_h.digest())
    return min_hash
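# get_min_hash relies on an external shingle_hashes helper; the one below is a
# hypothetical stand-in (sha1 over word 3-grams) so the function can be
# exercised end to end. The two sentences are illustrative.
from hashlib import sha1
from datasketch import MinHash


def shingle_hashes(text, k=3):
    words = text.lower().split()
    for i in range(max(len(words) - k + 1, 1)):
        yield sha1(" ".join(words[i:i + k]).encode('utf8'))


a = get_min_hash("the quick brown fox jumps over the lazy dog", too_common=set())
b = get_min_hash("the quick brown fox leaps over the lazy dog", too_common=set())
print(a.jaccard(b))  # estimated shingle overlap between the two sentences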
def getHashSig(tagsListOfPep):
    minHash = MinHash(num_perm=NUM_PERMUTATION)
    for tag in tagsListOfPep:
        minHash.update(tag.encode('utf-8'))
    return minHash.digest()
class Client(object):
    """ client class to control api with restful """

    def __init__(self, api_key=None, host_url=None, max_workers=1):
        """Initial sixecho

        Attributes:
            api_key(string)  - Optional : api_key generated from sixecho
            host_url(string) - Optional : the sixecho domain
        """
        self.api_key = api_key
        deepcut.tokenize("Welcome")  # Load library
        if host_url is not None:
            if host_url.endswith("/"):
                host_url = host_url[:-1]
        self.host_url = host_url
        self.array_words = []
        self.min_hash = MinHash(num_perm=128)
        self.max_workers = max_workers
        self.sha256 = ""

    def digest(self):
        """Export the hash values, which is the internal state of the MinHash.

        Returns:
            numpy.array: The hash values which is a Numpy array.
        """
        return self.min_hash.digest()

    def generate(self, str=None, fpath=None):
        """Generate minhash with new value from string or file

        we use minhash from https://ekzhu.github.io/datasketch/_modules/datasketch/minhash.html#MinHash.update

        Args:
            str(string)   - Optional : string whose minhash is to be computed.
            fpath(string) - Optional : path of the file to be computed.
        """
        if fpath:
            self.load_file(fpath)
        else:
            sha256 = hashlib.sha256()
            sha256.update(str.encode())  # hashlib requires bytes, not str
            self.sha256 = sha256.hexdigest()
            self.array_words = tokenize(str)
            for d in self.array_words:
                self.min_hash.update(d.encode('utf8'))

    def upload(self):
        """Upload digital content to server"""
        digest = ",".join([str(num) for num in self.digest()])
        if self.host_url is None or self.api_key is None:
            raise Exception("Require host_url and api_key")
        headers = {
            "x-api-key": self.api_key,
            'content-type': 'application/json'
        }
        response = requests.post((self.host_url + "/checker"),
                                 json={
                                     "digest": digest,
                                     "sha256": self.sha256
                                 },
                                 headers=headers)
        print("content:" + str(response.text))
        return json.loads(response.text)

    def load_file(self, fpath):
        """ method load_file """
        sha256 = hashlib.sha256()
        f_count = open(fpath, "r")
        f = f_count.readlines()
        f_count.close()
        list_of_groups = None
        if self.max_workers != 1:
            l = f
            n = self.max_workers
            list_of_groups = [l[i:i + n] for i in range(0, len(l), n)]
            # list_of_groups = zip(*(iter(f), ) * self.max_workers)
        fileSize = os.path.getsize(fpath)
        printProgressBar(0,
                         fileSize,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)
        progress = 0
        lines = []
        if self.max_workers == 1:
            for line in f:
                progress = progress + len(line)
                sha256.update(line.encode())  # hashlib requires bytes, not str
                words = tokenize(line)
                if len(words) != 0:
                    for d in words:
                        self.min_hash.update(d.encode('utf8'))
                printProgressBar(progress,
                                 fileSize,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
        else:
            for line in f:
                sha256.update(line.encode())
            for lines in list_of_groups:
                for line in lines:
                    progress = progress + len(line)
                    # sha256.update(line)
                words = tokenize_mutiline(lines)
                if len(words) != 0:
                    for d in words:
                        self.min_hash.update(d.encode('utf8'))
                printProgressBar(progress,
                                 fileSize,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
        self.sha256 = sha256.hexdigest()
class PradoProjector(Projector):

    def __init__(
        self,
        feature_length: int = None,
        config: Optional[PradoProjectorConfig] = None,
    ):
        super().__init__()

        if config is None:
            config = PradoProjectorConfig(feature_length=feature_length)
        self._config = copy.deepcopy(config)

        self._hashobj = MinHash(num_perm=self.n_permutations,
                                hashfunc=farmhash.hash32)
        self._projection_operator = PradoProjectionOperator()
        self._vectorized_projection = np.vectorize(self.project,
                                                   signature="()->(n)")

    # region Properties
    @property
    def feature_length(self) -> int:
        return self._config.feature_length

    @property
    def B(self) -> int:
        return self.feature_length

    @property
    def n_permutations(self) -> int:
        return (2 * self.B + 32 - 1) // 32

    # endregion

    def project(self, x: str):
        self._hashobj.clear()
        self._hashobj.update(x)

        # (4 * n_permutations, )
        token_as_bytes = b"".join(
            int(x).to_bytes(4, "big") for x in self._hashobj.digest())

        # (32 * n_permutations, )
        token_as_bits = bitarray.bitarray()
        token_as_bits.frombytes(token_as_bytes)

        # (2B, ) - MinHash can give us larger hashes than we need.
        # It is recommended to set B up so this doesn't destroy/skip data.
        # In other words, B should be a multiple of 16.
        return torch.tensor(token_as_bits[:2 * self.B], dtype=torch.float)

    def __call__(self, x: List) -> torch.Tensor:
        # Can be anything, (Any, N[str]) -> (Any, N, 2B)
        token_features = self._vectorized_projection(x)
        token_features = torch.tensor(token_features, dtype=torch.float)

        # (Any, N, 2B) -> (Any, N, B, 2)
        token_features = torch.reshape(token_features,
                                       (*token_features.shape[:-1], -1, 2))

        # (Any, N, B, 2) -> (Any, N, B, 1)
        fingerprint = self._projection_operator(token_features)

        # (Any, N, B, 1) -> (Any, N, B)
        fingerprint = torch.squeeze(fingerprint, dim=-1)

        return fingerprint
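# A small sketch of the hashing step in project() without the torch/bitarray
# machinery: with an assumed feature length B, (2*B + 31) // 32 permutations
# yield at least 2*B hash bits, of which the first 2*B are kept.
from datasketch import MinHash

B = 64                           # assumed feature length
n_perm = (2 * B + 32 - 1) // 32  # 4 permutations -> 128 hash bits

m = MinHash(num_perm=n_perm)
m.update("token".encode('utf8'))  # stock sha1-based hashing instead of farmhash

bits = "".join(format(int(v), "032b") for v in m.digest())
fingerprint = [int(b) for b in bits[:2 * B]]
print(len(fingerprint))          # 128 == 2 * B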