Example #1
def main():
    print('Loading the corpus')
    all_csv = pd.read_csv("./all.csv", encoding="utf-8")
    raw_corpus = all_csv["text"]

    print('Normalizing it')
    normalized_corpus: List[List[str]] = [
        normalize(proverb) for proverb in raw_corpus
    ]

    print('Building an index to find duplicates')
    lsh = MinHashLSH(num_perm=HASH_PERMUTATIONS_COUNT)
    deduplicated_corpus = []
    for i, words in enumerate(normalized_corpus):
        words_hash = to_minhash(words)
        duplicates = lsh.query(words_hash)
        if duplicates:
            print(f'Matches found for ({i}): {raw_corpus[i]}')
            # Drop the duplicate row so the saved CSV matches the deduplicated corpus.
            all_csv = all_csv.drop(index=i)
            for idx in duplicates:
                print(f'\t{idx:>5d}. {raw_corpus[idx]}')
        else:
            lsh.insert(i, words_hash)
            deduplicated_corpus.append(raw_corpus[i])
    print('Duplicates removed:', len(raw_corpus) - len(deduplicated_corpus))

    print(
        f'Saving the deduplicated corpus ({len(deduplicated_corpus)} reviews)'
    )
    all_csv.to_csv("./all_deduplicated.csv", encoding="utf-8", index=False)
Example #2
def find_duplicates(minhashes, threshold, permutations, name_hashes):
    """
    Find the duplicates amongst the minhashes.

    Arguments:
    - minhashes: a list of minhashes
    - threshold: the Jaccard threshold for similarity / identity
    - permutations: the number of permutations. Must be the same as for the
                    minhash objects
    - name_hashes: list of document hashes (or any ID type, really). If not
                   empty, similarities between documents with the same ID are
                   taken for granted and are not reported.
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=permutations)
    for i, mh in enumerate(minhashes, start=1):
        lsh.insert(str(i), mh, check_duplication=False)
    for i, mh in enumerate(minhashes, start=1):
        similar = lsh.query(mh)
        similar.remove(str(i))
        if name_hashes:
            # Remove matches that occur in the same document
            similar = [
                s for s in similar
                if name_hashes[i - 1] != name_hashes[int(s) - 1]
            ]
        if similar:
            print('{}\t{}'.format(i, ' '.join(similar)))
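A minimal usage sketch for the helper above (the sample documents and the _minhash builder are illustrative assumptions, not part of the original):

from datasketch import MinHash

def _minhash(text, permutations=128):
    # Hypothetical helper: one token-level MinHash per document.
    m = MinHash(num_perm=permutations)
    for token in text.split():
        m.update(token.encode('utf-8'))
    return m

docs = [
    "the cat sat on the mat",
    "the cat sat on the mat",
    "an entirely unrelated sentence",
]
minhashes = [_minhash(d) for d in docs]

# Documents 1 and 2 are identical, so each reports the other as a duplicate.
find_duplicates(minhashes, threshold=0.9, permutations=128, name_hashes=[])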
Example #3
def get_topn_similarity_documents_lsh(keywords, n=3):
    lsh = MinHashLSH(threshold=0.1, num_perm=128)
    documents_en = docs_col.find({"lang": 'english'})
    documents_min = [
        lsh_json(str(item["_id"]), item["keyword"]) for item in documents_en
    ]
    for item in documents_min:
        minhash = MinHash(num_perm=128)
        list_keyword = item["keyword"].split(",")
        for k in list_keyword:
            minhash.update(k.encode("utf-8"))
        lsh.insert(str(item["id"]), minhash)

    query_min = MinHash(num_perm=128)
    keywords = keywords.split(",")
    for k in keywords:
        # print(k)
        query_min.update(k.encode("utf-8"))
    result = lsh.query(query_min)
    list_docs = []
    if result:
        for item in result:
            doc = docs_col.find_one({"_id": ObjectId(str(item))})
            doc.pop('_id', None)
            list_docs.append(doc)
    print(list_docs)
    return list_docs
Example #4
class LSH():
    def __init__(self,rawlist,shingle_length=2,threshold=0.8):
        self.indoc = rawlist
        self.make_lsh(shingle_length=shingle_length,threshold=threshold)
        
    def make_shingles(self,doc,length=2):
        s = []
        for i in range(len(doc)-(length-1)):
            s.append(doc[i:i+length])
        return s

    def make_shingle_sets(self,doclst=None,length=2):
        if doclst is None: doclst = self.indoc
        sets = {}
        for d in doclst:
            sets[d] = self.make_shingles(d,length)    
        return sets
          
    def make_lsh(self,shingle_length=2,threshold=0.8):
        print(f'Making LSH with threshold of {threshold}, shingle length of {shingle_length}')
        sets = self.make_shingle_sets(self.indoc,shingle_length)
        self.minhashes = {}
        self.lsh = MinHashLSH(threshold=threshold, num_perm=128)
        for k in sets.keys():
            m = MinHash(num_perm=128)
            for item in sets[k]:
                m.update(item.encode('utf8'))
            self.minhashes[k] = m
            self.lsh.insert(k,m)

    def get_minhash(self,doc):
        return self.minhashes[doc]
    def get_bucket(self,target_mh):
        return self.lsh.query(target_mh)
Example #5
def LSH():
    return_result = []
    result = part1.readFile(k=4)
    num_perm = 1024
    '''
    threshold (float) – Jaccard similarity threshold; defaults to 0.5.
    num_perm (int, optional) – number of hash permutation functions; for weighted MinHash, the sample size.
    params (tuple, optional) – the number and size of the bands.
    '''
    lsh = MinHashLSH(threshold=0.9, num_perm=num_perm)  #num_perm=128
    index = 1
    for each in result:
        # each element `each` is a set
        doc = MinHash(num_perm=num_perm)
        for d in each:
            doc.update(d.encode('utf8'))
        lsh.insert(str(index), doc)
        index = index + 1

    for each_doc in result:
        doc_target = MinHash(num_perm=num_perm)
        for e in each_doc:
            doc_target.update(e.encode('utf8'))
        re = lsh.query(doc_target)
        print("Approximate neighbours with Jaccard similarity > 0.35", re)
        return_result.append(re)
    return clean_data(return_result)
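The comment block in the example above describes the MinHashLSH constructor arguments; as a small sketch with assumed values (not from the original), the band/row layout can also be fixed explicitly via params instead of letting the threshold pick it:

from datasketch import MinHash, MinHashLSH

# 32 bands of 4 rows each; bands * rows must not exceed num_perm.
lsh = MinHashLSH(num_perm=128, params=(32, 4))

m = MinHash(num_perm=128)
for token in "locality sensitive hashing".split():
    m.update(token.encode('utf8'))
lsh.insert('doc-0', m)
print(lsh.query(m))  # ['doc-0']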
Example #6
def deduplicate_file(file_prefix, output_dir, threshold, permutations):
    """
    Deduplicates a set of minhashed documents (3 files with the same minhash
    prefix) and writes them to output_dir.

    Warning: only works for full documents at this point!
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=permutations)
    file_base = op.basename(file_prefix)
    logging.info('Processing batch {}...'.format(file_base))
    total_read = 0
    with closing(
            BatchWriter(sys.maxsize, output_dir, len(file_base),
                        int(file_base))) as bw:
        for input_file, results in read_batch(file_prefix):
            minhashes, new_minhashes = results['minhash'], []
            doc_ids, new_doc_ids = results['id'], []
            total_read += len(doc_ids)
            for i, minhash in enumerate(minhashes):
                if not lsh.query(minhash):
                    lsh.insert('_'.join(doc_ids[i]), minhash)
                    new_minhashes.append(minhash)
                    new_doc_ids.append(doc_ids[i])
            bw.write_results(input_file, {
                'id': new_doc_ids,
                'minhash': new_minhashes
            })
            logging.debug('Kept {} documents out of {}'.format(
                len(new_doc_ids), len(doc_ids)))
    logging.info('Processed batch {}; kept {} documents out of {}.'.format(
        file_base, bw.total_written, total_read))
Example #7
def build_content_sim_mh_text(network, mh_signatures):
    def connect(nid1, nid2, score):
        network.add_relation(nid1, nid2, Relation.CONTENT_SIM, score)

    # Materialize signatures for convenience
    mh_sig_obj = []

    content_index = MinHashLSH(threshold=0.7, num_perm=512)

    # Create minhash objects and index
    for nid, mh_sig in mh_signatures:
        mh_obj = MinHash(num_perm=512)
        mh_array = np.asarray(mh_sig, dtype=int)
        mh_obj.hashvalues = mh_array
        content_index.insert(nid, mh_obj)
        mh_sig_obj.append((nid, mh_obj))

    # Query objects
    for nid, mh_obj in mh_sig_obj:
        res = content_index.query(mh_obj)
        for r_nid in res:
            if r_nid != nid:
                connect(nid, r_nid, 1)

    return content_index
Example #8
def main() -> None:
    for _ in tqdm(range(1), desc="Create finding example:"):
        minhash = MinHash(num_perm=256)
        list_strings = []
        for _ in range(200):
            rand_string = ''.join(
                random.choice(string.ascii_lowercase) for i in range(5))
            list_strings.append(rand_string)
        minhash.update_batch([s.encode('utf-8') for s in list_strings])

    for _ in tqdm(range(1), desc="Connect to existing db:"):
        lsh = MinHashLSH(threshold=0.5,
                         num_perm=256,
                         storage_config={
                             'type': 'cassandra',
                             'basename': b'perftest',
                             'cassandra': {
                                 'seeds': ['127.0.0.1'],
                                 'keyspace': config.KEY_SPACE,
                                 'replication': {
                                     'class': 'SimpleStrategy',
                                     'replication_factor': '1',
                                 },
                                 'drop_keyspace': False,
                                 'drop_tables': False,
                             }
                         })

    try:
        for _ in tqdm(range(1), desc="Find minHash similarity:"):
            result = lsh.query(minhash)
        print("Approximate neighbours with Jaccard similarity > 0.5", result)
    except BaseException as e:
        print(str(e))
        print("Error")
Example #9
def remove_similar_tweets(df, text_col="text", lang_col="lang_x", max_jaccard_similarity=0.5):
    """
    Use locality-sensitive hashing to efficiently remove tweets that are similar to others
    (might be auto-generated or retweets).
    English tweets only.
    """
    t0 = time.time()
    df["tweet_clean"] = np.vectorize(TweetsAnalysis.preprocess_tweet)(df[text_col], df[lang_col])
    tweets = [t.split(" ") for t in df["tweet_clean"]]
    t1 = time.time()
    print(t1 - t0, "cleaned tweets")
    lsh = MinHashLSH(threshold=max_jaccard_similarity, num_perm=64)  # jaccard similarity
    idx_selected = {}
    df_indices = df.index.values.tolist()
    for idx, tweet in zip(df_indices, tweets):
        s = MinHash(num_perm=64)
        for word in tweet:
            s.update(word.encode('utf8'))
        # only add if the tweet is not similar to existing ones
        if len(lsh.query(s)) == 0:
            lsh.insert(idx, s)
            idx_selected[idx] = True
    t2 = time.time()
    print(t2 - t1, "created lsh")
    # only select the first tweet in a group of similar tweets
    df['select'] = pd.Series([idx_selected.get(idx, False) for idx in df_indices], index=df_indices)
    print(df["select"].value_counts())
    t3 = time.time()
    print(t3 - t2, "selected df")
    return df[df["select"]]
Example #10
def minHash_LSH(data):
    # Create a MinHashLSH index optimized for a Jaccard threshold of 0.65,
    # accepting MinHash objects with 256 permutation functions
    # Create LSH index
    lsh = MinHashLSH(threshold=0.65, num_perm=256)
    
    # Create MinHash objects
    minhashes = {}
    for c, i in enumerate(data):
      # c is the index, i is the tuple
      #print(i)
      minhash = MinHash(num_perm=256)
      for el in i:
          minhash.update(el.encode('utf8'))
#      for d in ngrams(i, 3):
#        minhash.update("".join(d).encode('utf-8'))
      lsh.insert(c, minhash)
      minhashes[c] = minhash
      #print(str(c)+" "+str(minhashes[c]))
      
    res_match=[]
    for i in range(len(minhashes.keys())):
      result = lsh.query(minhashes[i])
      
      if result not in res_match and len(result)==2:
          res_match.append(result)
          #print("Candidates with Jaccard similarity > 0.6 for input", i, ":", result)
    #print(res)
#    for i in range(len(res_match)):
#        print(data[res_match[i][0]])
#        print(data[res_match[i][1]])
    return res_match
Example #11
def search_lsh_jaccard_topk(index_data, query_data, b, r, k):
    (index_sets, index_keys, index_minhashes) = index_data
    (query_sets, query_keys, query_minhashes) = query_data
    num_perm = b * r
    print("Building LSH Index.")
    start = time.perf_counter()
    index = MinHashLSH(num_perm=num_perm, params=(b, r))
    # Use the indices of the indexed sets as keys in LSH.
    for i in range(len(index_keys)):
        index.insert(i, index_minhashes[num_perm][i])
    end = time.perf_counter()
    print("Indexing time: {:.3f}.".format(end-start))
    print("Querying.")
    times = []
    results = []
    for query_minhash, query_key, query_set in \
            zip(query_minhashes[num_perm], query_keys, query_sets):
        start = time.perf_counter()
        result = index.query(query_minhash)
        # Recover the retrieved indexed sets and 
        # compute the exact Jaccard similarities.
        result = [[index_keys[i], compute_jaccard(query_set, index_sets[i])]
                               for i in result]
        # Sort by similarity.
        result.sort(key=lambda x : x[1], reverse=True)
        # Take the first k.
        result = result[:k]
        duration = time.perf_counter() - start
        times.append(duration)
        results.append((query_key, result))
        sys.stdout.write(f"\rQueried {len(results)} sets")
    sys.stdout.write("\n")
    return (results, times)
Example #12
def main():
    """Точка входа в приложение."""
    corpus_root = Path('corpus/clean')
    """Находим названия всех файлов"""
    list_files = file_searcher(corpus_root)

    print('Loading the corpus')
    raw_corpus = []
    for file in list_files:
        with open(file, 'r', encoding='utf-8') as src:
            text_news = '\n'.join([line.rstrip('\r\n') for line in src])
        raw_corpus.append(text_news)

    print('Normalizing it')
    normalized_corpus: List[List[str]] = [
        normalize(news) for news in raw_corpus
    ]

    print('Building an index to find duplicates')
    dst = open('duplicate.txt', 'w', encoding='utf-8')

    lsh = MinHashLSH(num_perm=HASH_PERMUTATIONS_COUNT)
    deduplicated_corpus = []
    for i, (file, words) in enumerate(zip(list_files, normalized_corpus)):
        words_hash = to_minhash(words)
        duplicates = lsh.query(words_hash)
        if duplicates:
            print(f'Matches found for ({file}): {raw_corpus[i]}',
                  file=dst)
            for idx in duplicates:
                print(f'\t{list_files[idx]}. {raw_corpus[idx]}', file=dst)
            print('\n\n\n\n', file=dst)
        else:
            lsh.insert(i, words_hash)
            deduplicated_corpus.append((raw_corpus[i], list_files[i]))
    print('Duplicates removed:',
          len(raw_corpus) - len(deduplicated_corpus),
          file=dst)
    dst.close()

    print(
        f'Saving the deduplicated corpus ({len(deduplicated_corpus)} news items)'
    )

    # Create empty genre folders
    all_genre = [
        'Политика', 'В мире', 'Экономика', 'Общество', 'Происшествия', 'Армия',
        'Наука', 'Культура', 'Религия', 'Спорт', 'Туризм'
    ]
    import os
    for genre in all_genre:
        newpath = 'corpus/super clean/' + genre
        if not os.path.exists(newpath):
            os.makedirs(newpath)

    # Save the corpus
    for text, name in deduplicated_corpus:
        with open('corpus/super clean/' + name[13:], 'w',
                  encoding='utf-8') as dst:
            print(text, file=dst)
Example #13
class DuplicationIndex:
    def __init__(
        self,
        *,
        duplication_jaccard_threshold: float = 0.85,
    ):
        self._duplication_jaccard_threshold = duplication_jaccard_threshold
        self._num_perm = NUM_PERM
        self._index = MinHashLSH(threshold=self._duplication_jaccard_threshold, num_perm=self._num_perm)

        self._duplicate_clusters = defaultdict(set)

    def add(self, code_key: Tuple, min_hash: MinHash) -> None:
        """Add a key to _index (MinHashLSH).
        The min_hash is used to query the closest matches based on the jaccard_threshold.
        The new key is either added to an existing cluster of one close match,
        or a new cluster is created. The clusters created in this way depend on the insertion order.

        Args:
            code_key (Tuple of (index, repo_name, path)):
                Theoretically any hashable key. Here we use a tuple to retrieve the information later.
            min_hash: MinHash of the code_key.
        """
        close_duplicates = self._index.query(min_hash)
        if code_key in self._index.keys:
            print(f"Duplicate key {code_key}")
            return

        self._index.insert(code_key, min_hash)
        if len(close_duplicates) > 0:

            for base_duplicate in close_duplicates:
                if base_duplicate in self._duplicate_clusters:
                    self._duplicate_clusters[base_duplicate].add(code_key)
                    break
            else:
                self._duplicate_clusters[close_duplicates[0]].add(code_key)

    def get_duplicate_clusters(self) -> List[List[Dict]]:
        """Export the duplicate clusters.
        For each cluster, the first element is the base element of the cluster.
        The base element has an estimated Jaccard similarity higher than the threshold with all the other elements.

        Returns:
            duplicate_clusters (List[List[Dict]]):
                List of duplicate clusters.
        """
        duplicate_clusters = []
        for base, duplicates in self._duplicate_clusters.items():
            cluster = [base] + list(duplicates)
            # reformat the cluster to be a list of dict
            cluster = [{"base_index": el[0], "repo_name": el[1], "path": el[2]} for el in cluster]
            duplicate_clusters.append(cluster)
        return duplicate_clusters

    def save(self, filepath) -> None:
        duplicate_clusters = self.get_duplicate_clusters()
        with open(filepath, "w") as f:
            json.dump(duplicate_clusters, f)
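A minimal usage sketch for DuplicationIndex (the NUM_PERM value, the make_min_hash helper, and the snippet data are assumptions for illustration, not from the original):

from datasketch import MinHash

NUM_PERM = 256  # assumed value of the module-level constant used by DuplicationIndex

def make_min_hash(code: str) -> MinHash:
    # Hypothetical helper: token-level MinHash with the same number of permutations.
    mh = MinHash(num_perm=NUM_PERM)
    for token in code.split():
        mh.update(token.encode("utf-8"))
    return mh

index = DuplicationIndex(duplication_jaccard_threshold=0.85)
snippets = [
    (0, "repo-a", "a.py", "def add(a, b): return a + b"),
    (1, "repo-b", "b.py", "def add(a, b): return a + b"),
    (2, "repo-c", "c.py", "print('hello world')"),
]
for idx, repo, path, code in snippets:
    index.add((idx, repo, path), make_min_hash(code))
print(index.get_duplicate_clusters())
index.save("duplicate_clusters.json")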
Example #14
def find_relation_class_name_matchings(network, kr_handlers):
    # Retrieve relation names
    st = time.time()
    names = []
    seen_sources = []
    for (db_name, source_name, _, _) in network.iterate_values():
        original_source_name = source_name
        if source_name not in seen_sources:
            seen_sources.append(source_name)  # seen already
            source_name = nlp.camelcase_to_snakecase(source_name)
            source_name = source_name.replace('-', ' ')
            source_name = source_name.replace('_', ' ')
            source_name = source_name.lower()
            m = MinHash(num_perm=32)
            for token in source_name.split():
                if token not in stopwords.words('english'):
                    m.update(token.encode('utf8'))
            names.append(('relation', (db_name, original_source_name), m))

    num_relations_inserted = len(names)

    # Retrieve class names
    for kr_name, kr_handler in kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            m = MinHash(num_perm=32)
            for token in cl.split():
                if token not in stopwords.words('english'):
                    m.update(token.encode('utf8'))
            names.append(('class', (kr_name, original_cl_name), m))

    # Index all the minhashes
    lsh_index = MinHashLSH(threshold=0.5, num_perm=32)

    for idx in range(len(names)):
        lsh_index.insert(idx, names[idx][2])

    matchings = []
    for idx in range(0, num_relations_inserted):  # Compare only with classes
        N = lsh_index.query(names[idx][2])
        for n in N:
            kind_q = names[idx][0]
            kind_n = names[n][0]
            if kind_n != kind_q:
                # match.format is db_name, source_name, field_name -> class_name
                match = ((names[idx][1][0], names[idx][1][1], "_"),
                         names[n][1])
                matchings.append(match)
    et = time.time()
    print("Time to relation-class (name): " + str(et - st))
    return matchings
Example #15
def perform_lsh(lsh_text,
                standard_labels,
                title_labels,
                char_ngram=5,
                savefile=''):
    t0 = time.time()
    shingled_desc = [shingles(desc) for desc in lsh_text]
    print_elapsed(t0, 'splitting the text into groups of characters')

    #Create hash signatures for shingles
    t0 = time.time()
    hash_objects = []
    for i in range(len(shingled_desc)):
        m = MinHash(num_perm=200)
        hash_objects.append(m)
    print_elapsed(t0, 'creating hash signatures')

    t0 = time.time()
    for ix, desc in enumerate(shingled_desc):
        for d in desc:
            hash_objects[ix].update(d.encode('utf8'))
    print_elapsed(t0, 'encoding hash objects')

    #Define LSH and Jaccard similarity threshold
    lsh = MinHashLSH(threshold=0.8, num_perm=200)

    content = []
    for ix, desc in enumerate(shingled_desc):
        content.append((standard_labels[ix], hash_objects[ix]))

    for ix, elem in enumerate(content):
        #lsh.insert('{}'.format(ix), elem[1]) #elem[0], elem[1])
        lsh.insert(elem[0], elem[1])

    #For each standard search all signatures and identify potential clashes (e.g. other standards with Jaccard similarity
    #of shingle sets greater or equal to the threshold). Note: some of the candidates might be false positives.
    candidates = {}
    for ix, desc in enumerate(shingled_desc):
        result = lsh.query(hash_objects[ix])
        if len(result) > 1:
            candidates[standard_labels[ix] + ': ' + title_labels[ix]] = [
                (res, df_nos['Title'].loc[res]) for res in result
            ]
            #candidates.append(result)
            print(standard_labels[ix] + ': ' + title_labels[ix], ': ',
                  [(res, df_nos['Title'].loc[res]) for res in result])
            #print(standard_labels[ix], ': ',result)
            print('***************')
        else:
            candidates[standard_labels[ix]] = 'none'

    if len(savefile):
        pd.DataFrame.from_dict(candidates, orient='index').to_csv(savefile)
    return candidates, shingled_desc, content, lsh
Example #16
def deduplicate_other_old(file_prefix, input_dir, output_dir, threshold,
                          permutations):
    """
    Removes all documents from a set of minhashed documents (3 files with the
    same minhash prefix) that occur in other batches in input_dir. Only
    batches whose number is higher than the batch in question are considered
    (i.e. upper triangular matrix).

    Warning: only works for full documents at this point!
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=permutations)
    file_base = op.basename(file_prefix)
    logging.info('Processing batch {}...'.format(file_base))

    # First, load the (already deduplicated) batch...
    for input_file, results in read_batch(file_prefix):
        for doc_id, minhash in zip(results['id'], results['minhash']):
            lsh.insert('\t'.join(doc_id), minhash)

    initial_len = len(lsh.keys)
    to_match_with = find_all_batches(input_dir,
                                     int(file_prefix.rpartition(os.sep)[-1]))

    # Now, remove all documents in it that are contained in other batches
    # to the "right" of it (with greater batch numbers)
    for batch in to_match_with:
        initial_batch_len = len(lsh.keys)
        for _, results in read_batch(batch):
            for i, minhash in enumerate(results['minhash']):
                for duplicate in lsh.query(minhash):
                    lsh.remove(duplicate)
        logging.info(
            'Cross-deduplicated batch {} with batch {}: {} -> {} documents.'.
            format(file_base, op.basename(batch), initial_batch_len,
                   len(lsh.keys)))

    # Finally, we print the documents left. Unfortunately, in order to
    # keep the format, we have to read the original batch again.
    with closing(
            BatchWriter(sys.maxsize, output_dir, len(file_base),
                        int(file_base))) as bw:
        # OK, we need to re-read the batch unfortunately
        for input_file, results in read_batch(file_prefix):
            doc_ids, minhashes = [], []
            for doc_id, minhash in zip(results['id'], results['minhash']):
                if '\t'.join(doc_id) in lsh:
                    doc_ids.append(doc_id)
                    minhashes.append(minhash)
            bw.write_results(input_file, {'id': doc_ids, 'minhash': minhashes})
    logging.info('Processed batch {}; kept {} out of {} documents.'.format(
        file_base, len(lsh.keys), initial_len))
    return len(lsh.keys), initial_len
Example #17
def compare_products(product1, product2):
    """Checks if the two given series/dicts(strictly) belong to the same product
    The keys should strictly be followed as per the dataset.
    """

    if product1['imageUrl'] == product2['imageUrl']:
        print('Yes')
        return
    text_cols = ['key_specs_text', 'description', 'title']
    id1 = product1['productId']

    check_image = False
    product1['full_text'] = ''
    product2['full_text'] = ''

    for col in text_cols:
        product1['full_text'] += ' ' + product1[col].translate(table)
        product2['full_text'] += ' ' + product2[col].translate(table)

    m1 = MinHash(num_perm=256)
    m2 = MinHash(num_perm=256)

    for d in ngrams(product1['full_text'], 3):
        m1.update(d.encode('utf-8'))
    for d in ngrams(product2['full_text'], 3):
        m2.update(d.encode('utf-8'))

    lsh = MinHashLSH(threshold=0.9, num_perm=256)
    lsh.insert(id1, m1)
    result = lsh.query(m2)
    if id1 in result:
        print('Similar Text')
        check_image = True
    if not check_image:
        print('No')
    else:
        print(product1['imageUrl'])
        print(product2['imageUrl'])

        img1 = download_image(product1['imageUrl'])
        img2 = download_image(product2['imageUrl'])

        img1 = np.expand_dims(extract_features(product1['imageUrl'], img1),
                              axis=0)
        img2 = np.expand_dims(extract_features(product2['imageUrl'], img2),
                              axis=0)

        cosine_mat = cosine_similarity(img1, img2)
        if cosine_mat > 0.5:
            print('Yes')
        else:
            print('No')
Example #18
def learn_duplicates(name, f, verbose=False):
    print(name)
    logging.basicConfig(level=logging.DEBUG)
    texts_sample = [
        item['extracted_text'] for item in item_reader(f, name, limit=300)]
    dupe_predictor = DupePredictor(texts_sample)

    lsh = MinHashLSH(threshold=0.9, num_perm=128)  # separate from dupe_predictor
    too_common_shingles = dupe_predictor.too_common_shingles
    threshold = 0.98
    y_pred, y_true = [], []
    def _report_pr():
        tp = sum(p > threshold and d for p, d in zip(y_pred, y_true))
        fp = sum(p > threshold and not d for p, d in zip(y_pred, y_true))
        fn = sum(p < threshold and d for p, d in zip(y_pred, y_true))
        n_dup = tp + fn
        print('precision: %.3f, recall %.3f at %.2f threshold '
                '(%d duplicates)' % (
            tp / (tp + fp) if tp else 0.,
            tp / n_dup if n_dup else 0., threshold, n_dup))
    for i, item in enumerate(item_reader(f, name)):
        dupe_prob = dupe_predictor.get_dupe_prob(item['url'])
        y_pred.append(dupe_prob)
        min_hash = get_min_hash(item['extracted_text'], too_common_shingles)
        if dupe_prob < threshold:
            duplicates = [url for url, _ in dupe_predictor.update_model(
                item['url'], item['extracted_text'])]
        else:
            # We think this is a duplicate: replicate crawling
            # and do not update the model.
            duplicates = list(lsh.query(min_hash))
        key = canonicalize_url(item['url'])
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, min_hash)
        y_true.append(bool(duplicates))
        if verbose:
            if duplicates and dupe_prob < threshold:
                path = _full_path(item['url'])
                sample = [url for url in duplicates
                          if _full_path(url) == path] or duplicates
                print('false negative %s (%s, %d more)' % (
                    item['url'], sample[0], len(sample) - 1))
            elif not duplicates and dupe_prob > threshold:
                print('false positive', item['url'])
        if i % 100 == 0:
            _report_pr()
    _report_pr()
Example #19
def main():
    path = Path('C:/Data/Python/JobLoss')
    orig_data = []
    ind_map = []
    ind = 0
    with open(path / 'Processed.json') as f:
        data = json.load(f)
        for tweet in data:
            if tweet['type'] != 'retweet':
                orig_data.append(tweet['orig_text'])
                ind_map.append(ind)
            ind += 1
            # orig_data.append(tweet['orig_text'])
    markers = [0 for _ in range(len(orig_data))]
    lsh = MinHashLSH(threshold=0.5, num_perm=128)
    minhashes = {}
    for c, i in enumerate(orig_data):
        # print(c)
        minhash = MinHash(num_perm=128)
        for d in ngrams(i, 5):
            minhash.update(''.join(d).encode('utf-8'))
        lsh.insert(c, minhash)
        minhashes[c] = minhash
    for i in range(len(minhashes.keys())):
        result = lsh.query(minhashes[i])
        if markers[i] == 2:
            continue
        markers[i] = 1
        for j in result:
            if markers[j] != 1:
                markers[j] = 2
    doc_set = set()
    similar_removed = [
        data[ind_map[ind]] for ind, val in enumerate(markers) if val != 2
    ]
    final = []
    identicals = 0
    for line in similar_removed:
        doc = ' '.join(line['text'])
        if doc in doc_set:
            identicals += 1
            continue
        doc_set.add(doc)
        final.append(line)
    print(identicals)
    print(len(final))
    with open(path / 'ProcessedSimilarRemoved.json', 'w') as f:
        json.dump(final, f)
Example #20
def lsh_similar(minhashes: Dict[T, MinHash], num_perm: int, bands: int, rows: int) -> Generator[Tuple[T, T], None, None]:
    """Yields all of similar pairs of minhashes using LSH

    minhashes - Dictionary of key to Minhash
    num_perm  - Number of permutations used in Minhash
    bands     - Number of bands to use in LSH
    rows      - Number of rows to use in LSH

    """
    lsh = MinHashLSH(num_perm=num_perm, params=(bands, rows))
    for i, mh in minhashes.items():
        # Check if duplicate of already seen item
        for j in lsh.query(mh):
            yield (j, i)
        # Add to the seen items
        lsh.insert(i, mh)
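A short usage sketch for lsh_similar (the keys, texts, and band/row split are illustrative assumptions):

from datasketch import MinHash

def _mh(text, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for word in text.split():
        m.update(word.encode('utf-8'))
    return m

docs = {
    "a": _mh("the quick brown fox jumps over the lazy dog"),
    "b": _mh("the quick brown fox jumps over the lazy dog"),
    "c": _mh("something completely different"),
}

# 16 bands of 8 rows: bands * rows == num_perm == 128.
for earlier, later in lsh_similar(docs, num_perm=128, bands=16, rows=8):
    print(earlier, "is similar to", later)  # prints: a is similar to b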
Example #21
def benchmark_lsh(threshold, index_data, query_data):
    print("Building LSH index")
    num_perm = len(index_data.minhashes[0].hashvalues)
    lsh = MinHashLSH(threshold, num_perm)
    for key, minhash in zip(index_data.filenames, index_data.minhashes):
        lsh.insert(key, minhash)
    print("Querying")
    times = []
    results = []
    for minhash in query_data.minhashes:
        start = time.perf_counter()
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(result)
    return times, results
Example #22
def benchmark_lsh(threshold, index_data, query_data):
    print("Building LSH index")
    num_perm = len(index_data.minhashes[0].hashvalues)
    lsh = MinHashLSH(threshold, num_perm)
    for key, minhash in zip(index_data.filenames, index_data.minhashes):
        lsh.insert(key, minhash)
    print("Querying")
    times = []
    results = []
    for minhash in query_data.minhashes:
        start = time.perf_counter()
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(result)
    return times, results
Example #23
class DuplicateChecker:
    def __init__(self):
        self.minhashes = {}
        self.lsh = MinHashLSH(threshold=THRESHOLD)

    def create_minhashes_reading_articles(self, start_date, end_date):
        """Fills the minhashes dict with the files paths as the keys and the minhashes from the articles bodies as
         the values"""
        for category in read_categories_from_file():
            for date_between in get_dates_between(start_date, end_date):
                try:
                    date_between = date_between.strftime('%Y/%m/%d')
                    current_dir_path = f'{DUMP_DIR}/{category}/{date_between}'
                    for filename in os.listdir(current_dir_path):
                        self._create_minhash_from_file(current_dir_path,
                                                       filename)
                except FileNotFoundError:
                    pass

    def _create_minhash_from_file(self, current_dir_path, filename):
        file_path = f'{current_dir_path}/{filename}'
        with open(file_path) as f:
            article = Article(**json.load(f))
            if not article.body:
                os.remove(file_path)
                return

            minhash = MinHash()
            for word in article.body.split(' '):
                minhash.update(word.encode('utf8'))
            lean_minhash = LeanMinHash(minhash)
            self.minhashes[file_path] = lean_minhash
            self.lsh.insert(file_path, lean_minhash)

    def find_similar_articles(self):
        """Finds every similar article from the LSH index, and removes it from the index itself as well as the file from
        the disk"""
        for path, minhash in self.minhashes.items():
            # The LSH will find at least the path itself, so we need to filter it
            for similar_article_path in [
                    x for x in self.lsh.query(minhash) if x != path
            ]:
                print(
                    f'\tremoving similar article from {similar_article_path}')
                self.lsh.remove(similar_article_path)
                with contextlib.suppress(FileNotFoundError):
                    os.remove(similar_article_path)
Example #24
def benchmark_lsh(num_perm, threshold, index_data, query_data):
    print("Building LSH index")
    lsh = MinHashLSH(threshold, num_perm)
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        lsh.insert(key, minhash)
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        start = time.perf_counter()
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(sorted([[key, _compute_jaccard(qs, index_data.sets[key])]
                               for key in result], 
                              key=lambda x : x[1], reverse=True))
    return times, results
Example #25
    def similarity_threshold_bulk(self,
                                  df_library,
                                  df_query,
                                  only_positive=False,
                                  return_df=False):
        """
        Takes a dataframe of 'library' strings to query against, and a dataframe of query strings. 
        Gives these unique IDs.
        Transforms both the library and the query strings into minhash objects.
        If return_df==True then df_query will be returned with a column showing how many similar utterances 
            have been found in df_library.
        TODO: maybe use redis in production
        """
        from datasketch import MinHashLSH

        lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
        data_library = self.dataframe_to_data_list(df_library, 'lib_')
        data_query = self.dataframe_to_data_list(df_query, 'query_')

        # use an insertion session to create an lsh object with all the lib data that can be queried
        with lsh.insertion_session() as session:
            for key, minhash in data_library:
                session.insert(key, minhash)

        # bulk query the data_query objects against lsh
        query_results = []
        df_query['no_similar'] = 0

        for key, minhash in data_query:
            query_result = lsh.query(minhash)
            query_result_length = len(query_result)

            if return_df:
                df_query.loc[key, 'no_similar'] = len(query_result)
            elif only_positive:
                # only need to care about only_positive if not returning a dataframe
                if query_result_length > 0:
                    query_results.append(
                        (key, query_result, query_result_length))
            else:
                query_results.append((key, query_result, query_result_length))

        if return_df:
            return df_query
        else:
            return query_results
Example #26
    def _index_records(self, records):
        """
          Constructs Minhash LSH buckets for a given set of records

          Args:
            records (dict) : dict of (record_id -> record_value)

          Returns:
            None
        """
        indexer = defaultdict(list)

        # Create minhashes
        minhashes = {}
        for rid in records:
            m = MinHash(num_perm=self._num_perm)
            for d in records[rid]:
                qgrams = set(self.nt.basic(d, 2))
                for gram in qgrams:
                    m.update(gram.encode('utf-8'))
            minhashes[rid] = m

        # Create LSH instance and add min hashes
        if self._bands == MinHashLSHRecordDeduplication.BANDS and self._rows == MinHashLSHRecordDeduplication.ROWS:
            lsh = MinHashLSH(threshold=self._threshold,
                             num_perm=self._num_perm)
        else:
            lsh = MinHashLSH(num_perm=self._num_perm,
                             params=(self._bands, self._rows))

        max_blocks = []
        for rid in records:
            lsh.insert(rid, minhashes[rid])
            max_blocks.append(rid)

        # Generate blocks
        while (len(max_blocks) > 0):
            key = max_blocks[0]
            bucket = lsh.query(minhashes[key])
            for rid in bucket:
                if rid in max_blocks:
                    max_blocks.remove(rid)
                indexer["b" + str(self._block_index)].append(rid)
            self._block_index += 1

        self._write_indexer(indexer)
Example #27
def lsh_clustering(
    signatures: List[np.ndarray],
    threshold: float = 0.5,
    num_perm: int = 128,
):
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    with lsh.insertion_session() as session:
        for key, minhash in enumerate(signatures):
            session.insert(f"id-{key}",
                           MinHash(num_perm=num_perm, hashvalues=minhash))

    neighbors: List[List[int]] = []

    for key, minhash in enumerate(signatures):
        result = lsh.query(MinHash(num_perm=num_perm, hashvalues=minhash))
        neighbors.append([int(x.split("-")[1]) for x in result])

    return neighbors
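A small usage sketch for lsh_clustering (the example texts are assumptions; the signatures are the hashvalues arrays of MinHash objects built with the same num_perm and default seed):

from datasketch import MinHash

texts = [
    "a b c d e f g h",
    "a b c d e f g x",
    "p q r s t u v w",
]
signatures = []
for text in texts:
    m = MinHash(num_perm=128)
    for token in text.split():
        m.update(token.encode("utf-8"))
    signatures.append(m.hashvalues)  # numpy array of hash values

# Texts 0 and 1 overlap heavily, so they typically appear in each other's neighbor lists.
print(lsh_clustering(signatures, threshold=0.5, num_perm=128))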
Example #28
def mass_values_jaccard(cols1: List[Column], cols2: List[Column]):
    lsh = MinHashLSH(
        threshold=0.2,
        num_perm=128,
        storage_config={
            "type": "redis",
            "redis": {
                "host": "localhost",
                "port": 6379
            }
        },
    )

    def to_minhash(values) -> MinHash:
        # Hash the column's values so columns can be compared by Jaccard similarity.
        m = MinHash(num_perm=128)
        for v in values:
            m.update(str(v).encode("utf-8"))
        return m

    with lsh.insertion_session() as session:
        for idx, col in enumerate(cols1):
            session.insert(str(idx), to_minhash(col.values))

    # Query each column of cols2 against the indexed columns of cols1.
    results = {}
    for idx, col in enumerate(cols2):
        results[idx] = lsh.query(to_minhash(col.values))
    return results
Example #29
def validate(Session, event):
    scheme, redis_host, port = os.environ["REDIS_URL"].split(":")
    redis_host = redis_host.replace("//", "")
    print({'host': redis_host, 'port': port})
    lsh = MinHashLSH(
        storage_config={
            'type': 'redis',
            'redis': {
                'host': redis_host,
                'port': port
            },
            'basename': b'digital_checker',
        })
    uid = uuid.uuid4().hex
    body = event["body-json"]
    print(body)
    api_key_id = event["context"]["api-key-id"]
    try:
        digest_str = body["digest"]
        meta_books = body["meta_media"]
        validate_params(Session, meta_books)
    except Exception as e:
        print("Error " + str(e))
        return {"statusCode": 200, "body": json.dumps({"message": str(e)})}
    m1 = convert_str_to_minhash(digest_str)
    result = lsh.query(m1)
    if len(result) > 0:
        return {
            "statusCode": 200,
            "body": json.dumps({
                "message": "Duplicate",
            }),
        }
    else:
        insert_mysql(Session, api_key_id, uid, body)
        lsh.insert(key=uid, minhash=m1)
        return {
            "statusCode": 200,
            "body": json.dumps({
                "message": "Ok",
                "id": uid,
            }),
        }
Example #30
def deduplicate_self(file_prefix, output_dir, threshold, permutations):
    """
    Deduplicates a set of minhashed documents (3 files with the same minhash
    prefix) and writes them to output_dir.

    Warning: only works for full documents at this point!
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=permutations)
    file_base = op.basename(file_prefix)
    logging.info('Processing batch {}...'.format(file_base))
    total_read = 0
    duplicate_urls = 0
    with closing(
            BatchWriter(sys.maxsize, output_dir, len(file_base),
                        int(file_base))) as bw:
        for input_file, results in read_batch(file_prefix):
            minhashes, new_minhashes = results['minhash'], []
            doc_ids, new_doc_ids = results['id'], []
            total_read += len(doc_ids)
            input_duplicate_urls = 0
            for doc_id, minhash in zip(doc_ids, minhashes):
                key = '_'.join(doc_id)
                if key in lsh:
                    input_duplicate_urls += 1
                    continue
                if not lsh.query(minhash):
                    lsh.insert(key, minhash)
                    new_minhashes.append(minhash)
                    new_doc_ids.append(doc_id)
            bw.write_results(input_file, {
                'id': new_doc_ids,
                'minhash': new_minhashes
            })
            duplicate_urls += input_duplicate_urls
            logging.debug('Kept {} documents out of {} in file {}; '
                          '{} duplicate urls.'.format(len(new_doc_ids),
                                                      len(doc_ids), input_file,
                                                      input_duplicate_urls))
    logging.info('Deduplicated batch {}; kept {} documents out of {}; '
                 '{} duplicate urls.'.format(file_base, bw.total_written,
                                             total_read, duplicate_urls))
    return bw.total_written, total_read
Example #31
    def get_most_similar(self, threshold=0.5, num_perm=128, ngrams_num=3):
        lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

        minhashes = {}
        data = self.create_data()
        for single_data in data:
            minhash = MinHash(num_perm=num_perm)
            file_name = single_data['file_name']
            content = single_data['content']
            for d in ngrams(content, ngrams_num):
                minhash.update("".join(d).encode('utf-8'))
            lsh.insert(file_name, minhash)
            minhashes[file_name] = minhash
        for file_name in minhashes.keys():
            result = lsh.query(minhashes[file_name])
            # Exclude the file itself; if any other file has similarity greater than 0.5, print it
            result.remove(file_name)
            if len(result) > 0:
                print("Candidates with Jaccard similarity > 0.5 for input ",
                      file_name, ":", result)
Example #32
	def consolidate_dupes(self, agg_files):
		# Remove short items
		for key, value in agg_files.items():
			for fkey in list(value['files'].keys()):
				# print("File params: ", value['files'][fkey].keys())
				if not 'content_text' in value['files'][fkey]:
					print("Missing file:", key, fkey)
					value['files'].pop(fkey)
				elif len(value['files'][fkey]['content_text']) < 100:
					print("Removing short file: ", (key, fkey))
					value['files'].pop(fkey)

		smap = {}
		for key, value in agg_files.items():
			for fkey in value['files']:
				smap[(key, fkey)] = value['files'][fkey]['content_text']

		perms = 512
		gram_sz = 10
		thresh = 0.5
		lsh = MinHashLSH(threshold=thresh, num_perm=perms)

		print("Loading word hashes")
		minhashes = {}

		with ProcessPoolExecutor(max_workers=10) as ex:
			print("Submitting jobs")
			futures = [(key, ex.submit(minhash_str, content, perms, gram_sz))
					for
						key, content
					in
						smap.items()
				]
			print("Submitted %s jobs. Consuming futures" % len(futures))
			for key, future in tqdm.tqdm(futures, "Hashing"):
				minhash = future.result()
				lsh.insert(key, minhash)
				minhashes[key] = minhash


		lens = {}
		for key, content in smap.items():
			clen = len(content)
			lens.setdefault(clen, [])
			lens[clen].append(key)
		lenl = list(lens.keys())
		lenl.sort()

		print("%s items in file map before dupe elimination" % len(smap))

		for clen in lenl:
			tgt_keys = lens[clen]
			for key in tgt_keys:
				if key not in smap:
					continue
				if key not in minhashes:
					continue

				result = lsh.query(minhashes[key])
				if key in result:
					result.remove(key)
				if result:
					still_ok = [tmp for tmp in result if tmp in smap]
					if still_ok:
						smap.pop(key)
						akey, fkey = key
						agg_files[akey]['files'].pop(fkey)

					# for res in result:
					# print(key)
					# print("Similar: ", result)

		print("%s items in file map after dupe elimination" % len(smap))

		return agg_files
Example #33
def print_stats(
        f, show=None, skip_unique=False, max_int_value=5, duration_limit=None,
        print_duplicates=False, print_urls=False, limit=None):
    stats = Counter()
    if not skip_unique:
        lsh = MinHashLSH(threshold=0.9, num_perm=128)
        too_common = get_too_common_shingles(f, limit=1000)
    urls = {}
    min_timestamp = max_timestamp = None
    for i, item in enumerate(item_reader(f, limit=limit)):
        if print_urls:
            print(item['url'])
        content_type = item.get('content_type', 'missing')
        stats.update([
            'content_type: ' + content_type,
            'content_type[0]: ' + content_type.split('/')[0]])
        if min_timestamp is None:
            min_timestamp = item['timestamp']
        max_timestamp = item['timestamp']
        if duration_limit and \
                (max_timestamp - min_timestamp) / 1000 > duration_limit:
            break
        if 'extracted_text' not in item:
            assert item['obj_stored_url']
            stats.update(['documents'])
            continue
        stats.update(['items'])
        for key, value in item['extracted_metadata'].items():
            if key == 'forms':
                for form in value:
                    stats.update(['form_{}'.format(form['form'])])
                    stats.update(['form_field {}'.format(f)
                                  for f in form['fields'].values()])
            if isinstance(value, list):
                value = len(value)
            if isinstance(value, int) and not isinstance(value, bool):
                if value >= max_int_value:
                    value = '{}+'.format(max_int_value)
                key = '{}_{}'.format(key, value)
            if value:
                stats.update([key])
                if key == show:
                    print(item['url'])
        if not skip_unique:
            min_hash = get_min_hash(item['extracted_text'], too_common)
            duplicates = lsh.query(min_hash)
            if not duplicates:
                stats.update(['unique_items'])
            elif print_duplicates:
                print('{} {} duplicates: {}'.format(
                    item['url'], len(duplicates),
                    ' '.join(urls[k] for k in duplicates[:10])))
            key = 'item_{}'.format(i)
            lsh.insert(key, min_hash)
            urls[key] = item['url']

    if max_timestamp and min_timestamp:
        stats['duration'] = (max_timestamp - min_timestamp) / 1000
    for k, v in sorted(stats.items()):
        print(k.ljust(20), v)
    return stats
Example #34
# Create MinHash objects
m = []
for i in range(len(allshingle)):
    m.append(MinHash(num_perm=128))


for i in range(len(allshingle)):
    for d in allshingle[i]:
        m[i].update(d.encode('utf8'))


# Create a MinHashLSH index with a Jaccard threshold of 1.0
# that accepts MinHash objects with 128 permutation functions
lsh = MinHashLSH(threshold=1, num_perm=128)

# Insert each MinHash into the index
for i in range(len(m)):
    lsh.insert("m%d" % i, m[i])

# Find every shingle set that matches more than 100 indexed items
result = []
for i in range(len(m)):
    matches = lsh.query(m[i])
    if len(matches) > 100:
        result.append(matches)

# Record the frequency of each such shingle set
index = []
for i in range(len(result)):
    index.append(len(result[i]))
Example #35
class NearDuplicate(EtlProcessor):
    """A class that acts over the raw tweets collected from the twitter stream
       in order to detect whether the tweet is duplicate, near-duplicate or
       nothing at all"""

    punct = re.compile(r"[\.,;:]\\xe2", re.IGNORECASE)

    langs = {
        "es": "spanish",
        "en": "english"
    }

    process_count = 0

    def __init__(
        self,
        connector=None,
        lang='en',
        threshold=0.8,
        permutations=90,
        autostart=True
    ):
        self.permutations = permutations
        self.threshold = threshold
        self.lang = lang
        self.connector = None
        self.lsh = None

        EtlProcessor.__init__(self, connector=connector, autostart=autostart)
        if autostart:
            self.load()
            self.listen()

    def listen(self):
        """Performs a model check on whether the current tweet
        is at least 80% similar to previously seen tweets"""
        for msg in self.connector.listen():
            tweet = json.loads(msg.value())
            try:
                if self.is_unique(tweet):
                    self.connector.send(
                        msg.value()
                    )
                    self.connector.log(
                        json.dumps({
                            "id_str": tweet['id_str'],
                            "source": self.connector.consumer_topic,
                            "dest": self.connector.producer_topic
                        })
                    )
            except ValueError:
                self.connector.send(
                    json.dumps({
                        "id_str": tweet['id_str'],
                        "source": self.connector.consumer_topic,
                        "dest": "error"
                    })
                )
                continue
            finally:
                self.process_count += 1
                if self.process_count % 1000 == 0:
                    self.save()

    def load(self):
        """Loads the stored model data from previous runs"""
        model_path = './minhash-%s--%d-%.2f.pkl' % (
            self.lang,
            self.permutations,
            self.threshold
        )
        if os.path.isfile(model_path):
            self.lsh = pickle.load(open(model_path, 'rb'))
        else:
            self.lsh = MinHashLSH(
                threshold=self.threshold,
                num_perm=self.permutations
            )

    def save(self):
        """Stores the currently processed data for this model"""
        pickle.dump(
            self.lsh,
            open(
                './minhash-%s--%d-%.2f.pkl' % (
                    self.lang,
                    self.permutations,
                    self.threshold
                ),
                'wb+'
            )
        )

    def replace_urls(self, tweet):
        """Convenience function that replaces the compressed URLs by
        their expanded counterparts, in order to treat the same real URL
        as it is (and not obfuscating the same URL in different tweets by
        a different t.co link)"""
        removed_characters = 0
        if 'entities' in tweet and 'urls' in tweet['entities']:
            for url in tweet['entities']['urls']:
                tweet['text'] = tweet['text'][:(url['indices'][0] - removed_characters - 1)] + \
                    tweet['text'][(url['indices'][1] - removed_characters - 1):]
                removed_characters += url['indices'][1] - url['indices'][0]
            for url in tweet['entities']['urls']:
                tweet['text'] += ' ' + url['expanded_url']
        return tweet

    @lru_cache(maxsize=1_000_000)
    def minhash_tweet(self, tweet_text):
        """Minhashing operation that allows for a caching of up to
        1M tweets in order to speed up the checking procedure when it's
        the same tweet text"""
        tweet_hash = MinHash(num_perm=self.permutations)
        for word in tweet_text.split():
            tweet_hash.update(
                self.punct.sub("", word).encode('utf8')
            )
        return tweet_hash

    def is_unique(self, tweet):
        """Core method to check whether this tweet resembles enough to other previous
        tweets to label it as unique or near-duplicate"""
        is_unique_tweet = False
        urlfied_tweet = self.replace_urls(tweet)
        mht = self.minhash_tweet(
            urlfied_tweet['text']
        )
        if 'minteressa' not in tweet:
            tweet['minteressa'] = {}
        if not self.lsh.is_empty():
            similars = self.lsh.query(mht)
            if len(similars) == 0:
                # It's a unique tweet
                try:
                    self.lsh.insert(
                        tweet['id_str'],
                        mht
                    )
                    is_unique_tweet = True
                except ValueError:
                    logging.error(ValueError)
            else:
                # Near-duplicate: record it instead of inserting
                tweet['minteressa'].setdefault('nearduplicates', 0)
                tweet['minteressa']['nearduplicates'] += 1
        else:
            is_unique_tweet = True
            self.lsh.insert(
                tweet['id_str'],
                mht
            )
        return is_unique_tweet
Example #36
def minhash_merger_series(interactive=True):


	matchlogger = MatchLogBuilder()
	if interactive:
		callback=askuser_callback_series
	else:
		callback=matchlogger.add_match_series

	print("fetching series")
	with app.app_context():
		items = models.Series.query.options(
			joinedload(Series.alternatenames)
			).all()
		altn = []
		for item in items:
			for name in item.alternatenames:
				altn.append((name.id, name.series, name.cleanname, item.title))

	print("Building mapping dictionaries")
	# Map altname id to series id
	altnid_sid_dict  = dict([(tmp[0], tmp[1]) for tmp in altn])
	altnid_name_dict = dict([(tmp[0], tmp[2]) for tmp in altn])
	sid_sname_dict   = dict([(tmp[1], tmp[3]) for tmp in altn])

	sid_altnid_dict = {}
	for nid, sid in altnid_sid_dict.items():
		sid_altnid_dict.setdefault(sid, [])
		sid_altnid_dict[sid].append(nid)


	print("Have %s altnames for %s series" % (len(altnid_sid_dict), len(sid_altnid_dict)))

	perms = 512
	gram_sz = 3
	minhashes = {}
	lsh = MinHashLSH(threshold=SIMILARITY_RATIO, num_perm=perms)

	print("Building lsh minhash data structure")
	with ProcessPoolExecutor(max_workers=8) as ex:
		print("Submitting jobs")
		futures = [(key, ex.submit(minhash_str, content, perms, gram_sz))
				for
					key, content
				in
					altnid_name_dict.items()
				if
					len(content) >= 5
			]

		print("Consuming futures")
		for key, future in tqdm.tqdm(futures):
			minhash = future.result()
			lsh.insert(key, minhash)
			minhashes[key] = minhash

	print("Doing search")

	for key, minhash in minhashes.items():
		result = lsh.query(minhash)
		if key in result:
			result.remove(key)
		if result:
			sid = altnid_sid_dict[result[0]]
			src_sid = altnid_sid_dict[key]
			if sid != src_sid:
				sname = sid_sname_dict[sid]
				res_sids = set([altnid_sid_dict[tmp] for tmp in result])
				names = []
				for res_id in result:
					if altnid_sid_dict[res_id] != src_sid:
						names.append((altnid_sid_dict[res_id], res_id, altnid_name_dict[res_id]))
				if names:
					names.sort()
					print("Search returned %s results in %s series for %s:%s" % (len(result), len(res_sids), src_sid, sname))
					for sid, nid, name in names:
						print("	%s -> %s: %s" % (str(sid).rjust(8), str(nid).rjust(8), name))


	if not interactive:
		matchlogger.save_log("./seriesname-matchset-minhash.json")