def __double_hash(self, i, x):
    """
    Double hashing.
    Given two independent hash functions hasha and hashb, a new hash function
    can be derived as:
        hash_i(x, m) = (hasha(x) + i * hashb(x)) mod m
    Varying i in this way yields the i-th derived hash function.
    :param i: index of the derived hash function
    :param x: the value to hash
    :return: hash value in the range [0, self.m)
    """
    if not isinstance(x, str):
        x = str(x)

    return (murmurhash.hash(x) +
            i * fnvhash.fnv0_32(bytes(x, encoding="utf8"))) % self.m
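A minimal usage sketch of the scheme described in the docstring: derive k hash indices from the two base hashes, e.g. to pick the bit positions of a Bloom filter over m bits. The helper name and the k/m values below are illustrative assumptions, not part of the original class.

import murmurhash
import fnvhash

def double_hash_indices(x, k, m):
    # hash_i(x, m) = (hasha(x) + i * hashb(x)) mod m, as described above
    if not isinstance(x, str):
        x = str(x)
    hash_a = murmurhash.hash(x)
    hash_b = fnvhash.fnv0_32(bytes(x, encoding="utf8"))
    return [(hash_a + i * hash_b) % m for i in range(k)]

# e.g. the k bit positions to set/check in a Bloom filter with m bits
positions = double_hash_indices("example", k=7, m=1024)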
Example #2
 def get_stream():
     keys_a = [
         key for key, _ in s2v_a.frequencies[:n_freq] if key not in seen
     ]
     keys_b = [
         key for key, _ in s2v_b.frequencies[:n_freq] if key not in seen
     ]
     while len(keys_a):
         key = random.choice(keys_a)
         keys_a.remove(key)
         word, sense = s2v_a.split_key(key)
         if sense in exclude_senses or (senses is not None
                                        and sense not in senses):
             continue
         if key not in keys_b:
             continue
         similar_a = {k for k, _ in s2v_a.most_similar(key, n=n_similar)}
         similar_b = {k for k, _ in s2v_b.most_similar(key, n=n_similar)}
         overlap = similar_a.intersection(similar_b)
         options = [
             {
                 "id": "A",
                 "html": get_option_html(similar_a, overlap)
             },
             {
                 "id": "B",
                 "html": get_option_html(similar_b, overlap)
             },
         ]
         random.shuffle(options)
         task_hash = murmurhash.hash(key)
         task = {
             "html": get_term_html(key),
             "text": key,
             "options": options,
             TASK_HASH_ATTR: task_hash,
             INPUT_HASH_ATTR: task_hash,
         }
         if show_mapping:
             opt_map = [
                 f"{opt['id']} ({mapping[opt['id']]})" for opt in options
             ]
             task["meta"] = {i + 1: opt for i, opt in enumerate(opt_map)}
         yield task
Example #3
 def get_stream():
     keys = [key for key, _ in s2v.frequencies[:n_freq] if key not in seen]
     while len(keys):
         key = random.choice(keys)
         keys.remove(key)
         word, sense = s2v.split_key(key)
         if sense in exclude_senses or (senses is not None
                                        and sense not in senses):
             continue
         most_similar = s2v.most_similar(key, n=n_similar)
         options = [{
             "id": k,
             "html": get_html(k, s)
         } for k, s in most_similar]
         task_hash = murmurhash.hash(key)
         task = {
             "html": get_html(key, large=True),
             "text": key,
             "options": options,
             "accept": [key for key, _ in most_similar],  # pre-select all
             TASK_HASH_ATTR: task_hash,
             INPUT_HASH_ATTR: task_hash,
         }
         yield task
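For orientation, a hedged sketch of how a stream like this is typically consumed; it relies only on the task fields built above ("text", "options", "accept") and assumes that s2v, seen, and the other names referenced by get_stream are already in scope.

import itertools

for task in itertools.islice(get_stream(), 3):
    # each task pre-selects every similar key via "accept"
    print(task["text"], [opt["id"] for opt in task["options"]], task["accept"])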
Example #4
 def get_stream():
     strategy_func = eval_strategies.get(strategy)
     log(f"RECIPE: Using strategy {strategy}")
     # Limit to most frequent entries
     keys = [key for key, _ in s2v.frequencies[:n_freq]]
     keys_by_sense = defaultdict(set)
     for key in keys:
         try:
             sense = s2v.split_key(key)[1]
         except ValueError:
             continue
         if (senses is None
                 or sense in senses) and sense not in exclude_senses:
             keys_by_sense[sense].add(key)
     keys_by_sense = {
         s: keys
         for s, keys in keys_by_sense.items() if len(keys) >= 3
     }
     all_senses = list(keys_by_sense.keys())
     total_keys = sum(len(keys) for keys in keys_by_sense.values())
     log(f"RECIPE: Using {total_keys} entries for {len(all_senses)} senses")
     n_passes = 1
     while True:
         log(f"RECIPE: Iterating over the data ({n_passes})")
         current_keys = copy.deepcopy(keys_by_sense)
         while any(len(values) >= 3 for values in current_keys.values()):
             sense = random.choice(all_senses)
             all_keys = list(current_keys[sense])
             key_a, key_b, key_c, sim_ab, sim_ac = strategy_func(
                 s2v, all_keys)
             if len({key_a.lower(), key_b.lower(), key_c.lower()}) != 3:
                 continue
             if sim_ab < threshold or sim_ac < threshold:
                 continue
             for key in (key_a, key_b, key_c):
                 current_keys[sense].remove(key)
             confidence = 1.0 - (min(sim_ab, sim_ac) / max(sim_ab, sim_ac))
             input_hash = murmurhash.hash(key_a)
             task_hash = murmurhash.hash(" ".join([key_a] +
                                                  sorted([key_b, key_c])))
             task = {
                 "label": "Which one is more similar?",
                 "html": get_html(key_a, large=True),
                 "text": f"{key_a}: {key_b}, {key_c}",
                 "key": key_a,
                 "options": [
                     {
                         "id": key_b,
                         "html": get_html(key_b, sim_ab),
                         "score": sim_ab,
                     },
                     {
                         "id": key_c,
                         "html": get_html(key_c, sim_ac),
                         "score": sim_ac,
                     },
                 ],
                 "confidence": confidence,
                 TASK_HASH_ATTR: task_hash,
                 INPUT_HASH_ATTR: input_hash,
             }
             if show_scores:
                 task["meta"] = {
                     "confidence": f"{confidence:.4}",
                     "strategy": strategy,
                 }
             yield task
         n_passes += 1
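strategy_func is characterized only by its call site above: it receives (s2v, all_keys) and returns (key_a, key_b, key_c, sim_ab, sim_ac). Below is a hedged sketch of one strategy that satisfies that contract, assuming the s2v object exposes a pairwise similarity method; it is illustrative and not necessarily one of the strategies registered in eval_strategies.

import random

def random_strategy(s2v, keys):
    # pick an anchor key and two distinct candidates, then score both pairs
    key_a = random.choice(keys)
    key_b, key_c = random.sample([k for k in keys if k != key_a], 2)
    sim_ab = s2v.similarity(key_a, key_b)
    sim_ac = s2v.similarity(key_a, key_c)
    return key_a, key_b, key_c, sim_ab, sim_ac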
Example #5
 def minhash(self, text, num_shingles, window=25):  # assume len(text) > 50
     # hash every `window`-character shingle and keep the num_shingles smallest hashes
     hashes = [murmurhash.hash(text[i:i + window]) for i in range(len(text) - window + 1)]
     return set(sorted(hashes)[0:num_shingles])
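The returned set is a bottom-k sketch: the num_shingles smallest shingle hashes. Below is a hedged sketch of the standard way two such sketches are compared, giving an estimate of the Jaccard similarity of the underlying shingle sets; the helper is illustrative and not part of the original class.

def jaccard_estimate(sig_a, sig_b, k):
    # keep the k smallest hashes of the union of both sketches
    union_bottom_k = set(sorted(sig_a | sig_b)[:k])
    # the fraction of those present in both sketches estimates the Jaccard similarity
    return len(union_bottom_k & sig_a & sig_b) / k

# sig_a = obj.minhash(text_a, num_shingles=50)
# sig_b = obj.minhash(text_b, num_shingles=50)
# similarity = jaccard_estimate(sig_a, sig_b, k=50)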
Example #6
 def hash(self, i, seed, ntweak, data):
     # derive the i-th seed from (seed, ntweak), then map the seeded hash into the bit array
     hseed = (i * seed + ntweak) & 0xffffffff
     hs = mh.hash(data, hseed)
     return hs % len(self.bits)
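This looks like the per-index hash of a Bloom filter: self.bits is the bit array, and mh.hash is assumed to accept a seed (as mmh3 does). A hedged sketch of how such a method is typically driven follows; add, contains, and num_hashes are hypothetical names, not taken from the original class.

 def add(self, data, num_hashes, seed, ntweak):
     # set one bit per derived hash function
     for i in range(num_hashes):
         self.bits[self.hash(i, seed, ntweak, data)] = 1

 def contains(self, data, num_hashes, seed, ntweak):
     # membership test may yield false positives but never false negatives
     return all(self.bits[self.hash(i, seed, ntweak, data)]
                for i in range(num_hashes))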
Example #7
    def similar_k(input_sentences,
                  sentence_encoder,
                  corpus_index,
                  db_session,
                  limit=10,
                  method='union',
                  group_by='cosine'):
        """Find similar sentences.

        Args:
            input_sentences (str/list[str]): one or more input sentences.
            sentence_encoder  : encoder
            limit (int): limit result set size to ``limit``.
            corpus_index : type of corpus where to fetch the suggestions from
            db_session  : Database to get neighbors from
            method (str): aggregation method ('union', 'mean', 'pc1', 'pc2').
            group_by (str): distance metric to use to group the result set. Default is 'cosine'.

        Returns:
            list<dict>
        """
        res = []
        nearest = dict()

        if method == 'textrank':
            from nlp.textrank import calc_textrank  # pylint: disable=import-outside-toplevel
            _, _, _, phrase_list = calc_textrank(input_sentences,
                                                 num_phrases=5)
            input_sentences = [' '.join(phrase[0] for phrase in phrase_list)]
            method = Aggregation.UNION

        embeddings = sentence_encoder.encode(input_sentences)
        indices = [murmurhash.hash(sent) for sent in input_sentences]

        for idx, dist in corpus_index.knn_query_batch(embeddings,
                                                      ids=indices,
                                                      limit=limit,
                                                      method=method):
            if idx not in nearest:
                nearest[idx] = dist
            else:
                nearest[idx] = min(nearest[idx], dist)

        for sentence in db_session.query(Sentence).filter(
                Sentence.id.in_(nearest.keys())).all():
            sentence_dict = sentence.to_dict()
            encoding = sentence_encoder.encode(sentence.sentence)
            distances = scipy.spatial.distance.cdist(encoding, embeddings,
                                                     group_by)
            nearest_idx = int(np.argmin(distances))  # closest input sentence
            sentence_dict['nearest'] = indices[nearest_idx]
            sentence_dict['dist'] = nearest[sentence.id]
            res.append(sentence_dict)

        return {
            'results': sorted(res, key=lambda x: x['dist']),
            'sentences': [{
                'id': sent_id,
                'text': sent
            } for sent_id, sent in zip(indices, input_sentences)],
        }
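A hedged usage sketch based only on the docstring and return shape above, assuming similar_k is reachable as a plain function or staticmethod; the encoder, index, and session objects are placeholders that must provide the encode, knn_query_batch, and query interfaces used inside similar_k.

# Hypothetical call; encoder, index, and session are stand-ins constructed elsewhere.
response = similar_k(
    ["How do I reset my password?"],
    sentence_encoder=encoder,
    corpus_index=index,
    db_session=session,
    limit=5,
    method="union",
    group_by="cosine",
)
for hit in response["results"]:
    print(hit["dist"], hit["nearest"])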