Пример #1
0
 def digest(self,
            search_result: [str],
            click: {str: int},
            norm=10.0) -> np.array:
     """Turn category paths into one vector whose Euclidean length is ``norm``.

     Each path is split on ``'/'`` and the first piece is discarded; the
     remaining pieces are treated as category ids.  For the id at depth
     ``lv`` (1-based) the slot ``self.catid2index[id] % dim`` receives
     ``+-1 / lv``, the sign being picked by the parity of
     ``farmhash.hash64withseed(id, 4321)``.  Click paths are additionally
     multiplied by their click count.

     Args:
         search_result: category paths from a search result.  Only used
             when ``self.dim_search`` is not ``None``.
         click: mapping from category path to click count.
         norm: Euclidean length the returned vector is scaled to.

     Returns:
         The search part (``self.dim_search`` slots, when enabled) followed
         by the click part (``self.dim_clicks`` slots), rescaled to length
         ``norm``.  ``None`` if any path contains no ``'/'`` or if the
         rescaled vector contains NaN (e.g. every slot is zero).

     Raises:
         KeyError: if a category id is missing from ``self.catid2index``.
     """

     def accumulate(vec, dim, path, weight):
         """Add the contributions of ``path`` to ``vec``; False if malformed."""
         if '/' not in path:
             return False
         for lv, cat_id in enumerate(path.split('/')[1:], start=1):
             index = self.catid2index[cat_id] % dim
             sign = 1.0 if farmhash.hash64withseed(
                 cat_id, 4321) % 2 == 0 else -1.0
             vec[index] += sign / lv * weight
         return True

     search_vec = None
     if self.dim_search is not None:
         search_vec = [0.0] * self.dim_search
         for path in search_result:
             if not accumulate(search_vec, self.dim_search, path, 1.0):
                 return None

     click_vec = [0.0] * self.dim_clicks
     for path, count in click.items():
         if not accumulate(click_vec, self.dim_clicks, path, count):
             return None

     ret = np.array(click_vec)
     if search_vec is not None:
         ret = np.concatenate((np.array(search_vec), ret))

     # An all-zero vector has length 0, so the rescale computes 0/0 = NaN and
     # the check below maps that to None.  That outcome is expected, so keep
     # NumPy from emitting a RuntimeWarning (or raising, when warnings are
     # promoted to errors) for it.
     with np.errstate(divide='ignore', invalid='ignore'):
         ret = ret / (np.linalg.norm(ret) / norm)
     if np.isnan(ret).any():
         return None
     return ret
Пример #2
0
 def get_signatures(self, queries) -> [[str]]:
     """Return one list of signature strings per hash table.

     ``self.digest_all_tables(queries)`` yields, for every table, a sequence
     of bit sequences.  Each bit sequence is joined into a single string.
     When ``self.bucket_limit`` is set, that string is replaced by the
     decimal text of ``farmhash.hash64withseed(string, 2048)`` modulo
     ``self.bucket_limit``.
     """
     per_table = []
     for rows in self.digest_all_tables(queries):
         # Decide once per table whether the bucket reduction applies.
         use_buckets = self.bucket_limit is not None
         keys = []
         for bits in rows:
             key = ''.join(str(bit) for bit in bits)
             if use_buckets:
                 hashed = farmhash.hash64withseed(key, 2048)
                 key = str(hashed % self.bucket_limit)
             keys.append(key)
         per_table.append(keys)
     return per_table
Пример #3
0
 def get_signatures(self, queries) -> [[str]]:
     """Return signature rows for ``queries`` across all hash tables.

     For every table produced by ``self.digest_all_tables(queries)`` each
     bit sequence is joined into one string and, when ``self.bucket_limit``
     is set, replaced by the decimal text of
     ``farmhash.hash64withseed(string, 2048) % self.bucket_limit``.  The
     table's strings are then reshaped into groups of ``self.F`` consecutive
     entries and transposed, so every table contributes ``self.F`` rows.

     Returns:
         A list of NumPy string arrays, ``self.F`` per table.  Empty when
         ``digest_all_tables`` yields no tables.

     Raises:
         AssertionError: if the last row does not have ``len(queries)``
             entries.
     """
     signatures = []
     for table_value in self.digest_all_tables(queries):
         keys = [''.join(map(str, bits)) for bits in table_value]
         if self.bucket_limit is not None:
             keys = [
                 str(farmhash.hash64withseed(key, 2048) % self.bucket_limit)
                 for key in keys
             ]
         # Flat list -> (groups, F) -> transposed to (F, groups).
         signatures.extend(np.array(keys).reshape(-1, self.F).T)
     # With no tables there is no row to inspect; indexing [-1] would raise
     # IndexError instead of returning the (valid) empty result.
     if signatures:
         assert len(signatures[-1]) == len(queries), "invalid shape on sigs axis 1"
     return signatures
Пример #4
0
 def __call__(self, x):
     """Hash ``str(x)`` with the instance seed; reduce by the modulus if set."""
     hashed = farmhash.hash64withseed(str(x), self._random_seed)
     # NOTE(review): the None check reads ``self.modulus`` while the reduction
     # uses ``self._modulus`` -- presumably one is a property over the other;
     # confirm against the class definition.
     if self.modulus is None:
         return hashed
     return hashed % self._modulus