def digest(self, search_result: [str], click: {str: int}, norm=10.0) -> np.array:
    """Build a signed, hashed feature vector from category paths.

    Every path is split on '/'; the segment before the first '/' is
    dropped and each remaining segment is treated as a category id.  A
    category id at 1-based depth ``lv`` adds ``sign / lv`` (times the click
    count for ``click`` entries) to the slot
    ``self.catid2index[cat_id] % dim``.  ``sign`` is +1.0 when the seeded
    farmhash value of the id is even and -1.0 otherwise.

    Args:
        search_result: Category paths from a search.  Only consulted when
            ``self.dim_search`` is not None.
        click: Mapping of category path to click count.
        norm: Length the returned vector is rescaled to (L2 norm).

    Returns:
        The search part (``self.dim_search`` slots, when enabled) followed
        by the click part (``self.dim_clicks`` slots), rescaled so its L2
        norm equals ``norm``.  ``None`` when a processed path contains no
        '/', or when the rescaled vector contains NaN (for example when
        every slot is zero).

    Lookup failures in ``self.catid2index`` propagate to the caller.
    """

    def accumulate(vec, dim, path, weight):
        """Add the contribution of *path* to *vec*; False if it has no '/'."""
        if '/' not in path:
            return False
        # Drop the segment before the first '/'; depth counts from 1.
        for level, cat_id in enumerate(path.split('/')[1:], start=1):
            index = self.catid2index[cat_id] % dim
            sign = 1.0 if farmhash.hash64withseed(cat_id, 4321) % 2 == 0 else -1.0
            vec[index] += sign / level * weight
        return True

    search_vec = None
    if self.dim_search is not None:
        search_vec = [0.0] * self.dim_search
        for path in search_result:
            if not accumulate(search_vec, self.dim_search, path, 1):
                return None

    click_vec = [0.0] * self.dim_clicks
    for path, count in click.items():
        if not accumulate(click_vec, self.dim_clicks, path, count):
            return None

    ret = np.array(click_vec)
    if search_vec is not None:
        ret = np.concatenate((np.array(search_vec), ret))

    # A zero vector (or norm == 0) divides by zero here.  That outcome is
    # expected and reported through the NaN check below, so silence the
    # floating-point warnings instead of leaking them to the caller.
    with np.errstate(divide='ignore', invalid='ignore'):
        ret = ret / (np.linalg.norm(ret) / norm)
    if np.isnan(ret).any():
        return None
    return ret
def get_signatures(self, queries) -> [[str]]:
    """Return one list of signature strings per hash table.

    ``self.digest_all_tables(queries)`` yields, for each table, a sequence
    of bit sequences.  Each bit sequence is joined into a single string by
    concatenating ``str(bit)`` for every bit.  When ``self.bucket_limit``
    is set, that string is replaced by the decimal string of its seeded
    farmhash value modulo ``self.bucket_limit``.
    """
    result = []  # one entry per table
    for table in self.digest_all_tables(queries):
        keys = [''.join(str(bit) for bit in bits) for bits in table]
        if self.bucket_limit is not None:
            keys = [
                str(farmhash.hash64withseed(key, 2048) % self.bucket_limit)
                for key in keys
            ]
        result.append(keys)
    return result
def get_signatures(self, queries) -> [[str]]:
    """Return signature rows for every hash table, split by ``self.F``.

    ``self.digest_all_tables(queries)`` yields one sequence of bit
    sequences per table.  Each bit sequence is joined into a string by
    concatenating ``str(bit)``; when ``self.bucket_limit`` is set the
    string is replaced by the decimal string of its seeded farmhash value
    modulo ``self.bucket_limit``.  The strings of one table are reshaped to
    ``(-1, self.F)`` and transposed, and the resulting ``self.F`` rows
    (NumPy arrays) are appended to the output list.

    Raises:
        AssertionError: if the last row's length differs from
            ``len(queries)``.
    """
    collected = []
    for rows in self.digest_all_tables(queries):
        keys = [''.join(str(bit) for bit in bits) for bits in rows]
        if self.bucket_limit is not None:
            keys = [
                str(farmhash.hash64withseed(key, 2048) % self.bucket_limit)
                for key in keys
            ]
        # Flat list of keys -> (-1, F) grid -> transposed so that each of
        # the F rows holds one key per group of F consecutive entries.
        grid = np.array(keys).reshape(-1, self.F).T
        collected.extend(grid)
    # Sanity check on the second axis of the accumulated result.
    assert len(collected[-1]) == len(queries), "invalid shape on sigs axis 1"
    return collected
def __call__(self, x):
    """Hash ``str(x)`` with the instance seed, reduced by the modulus if set.

    Returns the seeded farmhash value of ``str(x)`` unchanged when
    ``self.modulus`` is None, otherwise that value modulo
    ``self._modulus``.
    """
    hashed = farmhash.hash64withseed(str(x), self._random_seed)
    # NOTE(review): the None check reads `self.modulus` while the reduction
    # reads `self._modulus`; presumably `modulus` is a property over
    # `_modulus` -- confirm against the class definition.
    if self.modulus is None:
        return hashed
    return hashed % self._modulus