Example #1
 def fit(self, X):
     self._index = MinHashLSHForest(num_perm = self._n_perm, l = self._n_rep)
     for i, x in enumerate(X):
         m = MinHash(num_perm = self._n_perm)
         for e in x:
             m.update(str(e).encode('utf8'))  # MinHash.update expects bytes
         self._index.add(str(i), m)
     self._index.index()
Example #2
def estimateDistinctElements(items, num_perm):
    """This function estimates the number of distinct elements in a list.
       The default number of hash permutations is num_perm (128), but I
       adjusted it after researching more:
       http://blog.cluster-text.com/tag/minhash/"""
    h = MinHash(num_perm)  # MinHash object with num_perm hash permutations
    for item in items:
        h.update(item.encode('utf8'))  # fold each item into the sketch
    print("Estimated number of elements: ", h.count())
Example #3
def estimateDistinctElementParallel(listOfItems, num_perm):
    """Same as above, except here we have a nested for loop to iterate through
       the lists within the list. This function also appends the estimation
       result to a list for use in the following accuracy function."""
    h = MinHash(num_perm)
    for item in listOfItems:
        for i in item:  # nested loop: each item is itself a list of strings
            h.update(i.encode('utf8'))
    estimate.append(h.count())
    print("Estimated number of elements: ", h.count())
Example #4
 def minhash_tweet(self, tweet_text):
     """Minhashing operation; results can be cached for up to
     1M tweets to speed up the checking procedure when the same
     tweet text comes in again."""
     tweet_hash = MinHash(num_perm=self.permutations)
     for word in tweet_text.split():
         # Strip punctuation from the word, then hash its UTF-8 bytes.
         tweet_hash.update(self.punct.sub("", word).encode('utf8'))
     return tweet_hash
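The caching mentioned in the docstring is not shown in this snippet. A minimal sketch of one way it could be layered on, assuming functools.lru_cache and a module-level punctuation regex (both are assumptions, not taken from the original project):

import re
from functools import lru_cache

from datasketch import MinHash

PUNCT = re.compile(r"[^\w\s]")  # assumed stand-in for self.punct

@lru_cache(maxsize=1_000_000)  # "up to 1M tweets", per the docstring
def minhash_tweet(tweet_text, permutations=128):
    """Return a MinHash of the tweet text, memoized on the exact text."""
    tweet_hash = MinHash(num_perm=permutations)
    for word in tweet_text.split():
        tweet_hash.update(PUNCT.sub("", word).encode('utf8'))
    return tweet_hash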
Example #5
def main():
    path = Path('C:/Data/Python/JobLoss')
    orig_data = []
    ind_map = []
    ind = 0
    with open(path / 'Processed.json') as f:
        data = json.load(f)
        for tweet in data:
            if tweet['type'] != 'retweet':
                orig_data.append(tweet['orig_text'])
                ind_map.append(ind)
            ind += 1
            # orig_data.append(tweet['orig_text'])
    markers = [0 for _ in range(len(orig_data))]
    lsh = MinHashLSH(threshold=0.5, num_perm=128)
    minhashes = {}
    for c, i in enumerate(orig_data):
        # print(c)
        minhash = MinHash(num_perm=128)
        for d in ngrams(i, 5):
            minhash.update(''.join(d).encode('utf-8'))
        lsh.insert(c, minhash)
        minhashes[c] = minhash
    # markers: 0 = unvisited, 1 = kept representative, 2 = near-duplicate to drop
    for i in range(len(minhashes)):
        result = lsh.query(minhashes[i])
        if markers[i] == 2:
            continue
        markers[i] = 1
        for j in result:
            if markers[j] != 1:
                markers[j] = 2
    doc_set = set()
    similar_removed = [
        data[ind_map[ind]] for ind, val in enumerate(markers) if val != 2
    ]
    final = []
    identicals = 0
    for line in similar_removed:
        doc = ' '.join(line['text'])
        if doc in doc_set:
            identicals += 1
            continue
        doc_set.add(doc)
        final.append(line)
    print(identicals)
    print(len(final))
    with open(path / 'ProcessedSimilarRemoved.json', 'w') as f:
        json.dump(final, f)
Example #6
def DIDsamplingLittle(dataset,BF,username,userid,attra_id,beta,clustdict):
    # dataset = Cora_labeled.objects.all()
    # clustdict = dextrapreclustering.minhashPreClustering(dataset)
    cluster_membership = {}

    # values = models.sigirCoraAttrValue.objects.filter(attr_id=attra_id)
    # attrasynonyms = models.sigirCoraValueSynonym.objects.filter(value_id__in=[ value.id for value in values])
    # record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username,attrsynonym_id__in=[ syn.id for syn in attrasynonyms])
    record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username,attrsynonym__value__attr_id=attra_id)
    record_noAttra = dataset.exclude(id__in=[item.cora_id for item in record_hasAttra])

    for k, v in clustdict.items():
        for d in v:
            cluster_membership[d] = k

    sum = dataset.count()
    for record in dataset:
        # AC
        cora2ae = models.sigirCoraToAttrEntity.objects.filter(cora_id=record.id,user=username)
        if cora2ae:
            list = [ item.attrsynonym.value.attr.id for item in cora2ae]
            if attra_id in list:
                record.orderscore = 0
                record.save()
                continue
            else:
                ac = 1-len(list)/models.sigirCoraAttr.objects.filter(userid=userid).count()
        else:
            ac = 1

        # distribution on dataset
        k = cluster_membership[record.id]
        ic = len(clustdict[k])/record_noAttra.count()
        record_minhash = MinHash(num_perm=128)
        s = set(record.cleantext.split(" "))
        for d in s:
            record_minhash.update(d.encode('utf8'))
        term2sum = 0
        for rr in BF:
            rr_minhash =  MinHash(num_perm=128)
            ss = set(dataset.get(id = rr).cleantext.split(" "))
            for dd in ss:
                rr_minhash.update(dd.encode('utf8'))
            sim = record_minhash.jaccard(rr_minhash)
            sim = (sim/sum)**beta
            term2sum = term2sum + sim
        did = ac*ic*term2sum
        record.orderscore = did
        record.save()
    return dataset
Example #7
    def read_observations(self, ifp):
        rd = csv.DictReader(ifp)
        count = 0
        for row in rd:
            if row == rd._fieldnames:
                continue

            count += 1
            if count % 100000 == 0:
                progress(str(count))

            location = ObsvLocation(
                canon_url_syntax(row["url"]).geturl(),
                row["country"], row["timestamp"],
                row["as.owner"], row["vpn"])

            flags = row["flags"]
            pld = row["payload"]
            key = flags + "|" + pld

            if key in self.tok_payloads:
                tp = self.tok_payloads[key]
            else:
                self.tok_payloads[key] = tp = tokenize_payload(flags, pld)

            if key in self.discarded:
                continue

            discard, m_content, m_structure = self.mcp.match(tp, location)
            if discard:
                self.discarded.add(key)
                continue

            self.locations[tp.structure].append(location)
            self.locations[tp.content].append(location)

            if m_content:
                self.m_content_t.add(tp.content)
                for m in m_content:
                    self.m_content[m].add(tp.content)

            elif tp.content not in self.hashes:
                ch = MinHash(num_perm=128)
                for ct in tp.content_t:
                    ch.update(ct.encode('utf-8'))
                self.hashes[tp.content] = (ch, len(tp.content_t))

            if m_structure:
                self.m_structure_t.add(tp.structure)
                for m in m_structure:
                    self.m_structure[m].add(tp.structure)
            elif tp.structure not in self.hashes:
                sh = MinHash(num_perm=128)
                for st in tp.structure_t:
                    sh.update(st.encode('utf-8'))
                self.hashes[tp.structure] = (sh, len(tp.structure_t))

        progress(str(count))
Example #8
File: doc.py  Project: rbramwell/textpipe
 def similarity(self, other_doc, metric='jaccard', hash_method='minhash'):
     """
     Computes similarity for two documents.
     Only minhash Jaccard similarity is implemented.
     >>> doc1 = Doc('Sentence for computing the minhash')
     >>> doc2 = Doc('Sentence for computing the similarity')
     >>> doc1.similarity(doc2)
     0.7265625
     """
     if hash_method == 'minhash' and metric == 'jaccard':
         hash1 = MinHash(hashvalues=self.minhash)
         hash2 = MinHash(hashvalues=other_doc.minhash)
         return hash1.jaccard(hash2)
     else:
         raise NotImplementedError(f'Metric/hash method combination {metric}'
                                   f'/{hash_method} is not implemented as similarity metric')
Example #9
def query_sim(in_dir):

    js = json.load(codecs.open(in_dir, "r"))
    line = js["content_p"]
    seg_list = jieba.cut(line, cut_all=False)
    no_list = []
    for word in seg_list:
        if word not in stopword:
            no_list.append(word)

    mh = MinHash(num_perm=128)
    for word in no_list:
        mh.update(word.encode('utf8'))

    result = forest.query(mh, 1)
    return mh.jaccard(forest[result[0]])
Example #10
    def calc_hash(self, bytez):
        # Create a MinHashLSH index for Jaccard threshold 0.9
        # that accepts MinHash objects with 128 permutation functions
        lsh = MinHashLSH(threshold=0.9, num_perm=128)

        # Generate MinHash objects.
        minhashes = {}
        for c, i in enumerate(bytez):
            min_hash = MinHash()
            for d in ngrams(i, 3):
                min_hash.update("".join(str(d)).encode("utf-8"))
            lsh.insert(c, min_hash)
            minhashes[c] = min_hash

        return minhashes
        """
Example #11
def query_candidates(doc, topn):
    lsh = load_lsh()
    minhash = MinHash(num_perm=128)
    content = convert_text(doc['content'])
    ngram = ngrams_token(remove_punctuation(content), 3)
    for gram in ngram:
        minhash.update(gram.encode('utf-8'))
    result = lsh.query(minhash, topn)

    print(doc['title'])
    if result:
        for item in result:
            doc = docs_col.find_one({"_id": ObjectId(str(item))})
            print(doc['title'])
    print("=====================")
    return result
Example #12
def convert_str_to_minhash(digest):
    """Convert string that is including 128 numbers which to have a comma as middle between that numbers.
    Ex. 13241234,213242134,22342234,23423423,...,21341234 (128 numbers.)
    """
    data_array = np.array(digest.split(","), dtype=np.uint64)
    m1 = MinHash(hashvalues=data_array)
    return m1
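A quick round-trip sketch for the helper above: build a MinHash, join its 128 hash values with commas, and convert the string back (the tokens are made up):

from datasketch import MinHash

m = MinHash(num_perm=128)
for token in ["minhash", "round", "trip"]:
    m.update(token.encode('utf8'))

digest_str = ",".join(str(v) for v in m.digest())  # the expected comma-separated format
m2 = convert_str_to_minhash(digest_str)
assert m.jaccard(m2) == 1.0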
Example #13
def MinHashFunc(df, num_perm, stop_words):
    """
    Take in dataframe with relevant columns to append and hash row based on these columns
    Return: dictionary of MinHash objects for all rows
    """
    row_hash = {}
    stemmer = WordNetLemmatizer()
    # Iterate through rows in table and min hash each row, then return the dictionary of all rows and MinHash objects
    for i, row in df.iterrows():
        row_hash[i] = MinHash(num_perm=num_perm)
        split_name = re.sub('[^A-Za-z0-9]+', ' ', row['name'].lower()).split()
        stop_words_remove = [w for w in split_name if w not in stop_words]
        name_stem = [stemmer.lemmatize(w) for w in stop_words_remove]
        name_comb = [''.join(w) for w in combinations(name_stem, 2)]
        split_add = re.sub('[^A-Za-z0-9]+', ' ',
                           row['street_address'].lower()).split()
        # The phone number is repeated to give it extra weight in the signature.
        row_values = name_stem + name_comb + split_add + [row['phone']] * 2 + [
            row['postal_code']
        ]
        #        print(row_values)
        for j in row_values:
            try:
                row_hash[i].update(j.encode('utf8'))
            except AttributeError:
                continue
    return row_hash
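A possible usage sketch for MinHashFunc with a toy two-row DataFrame (the column names follow the code above; the rows are invented, and the NLTK stopword/WordNet corpora are assumed to be downloaded):

import pandas as pd
from nltk.corpus import stopwords

df = pd.DataFrame([
    {"name": "Joe's Pizza", "street_address": "7 Carmine St",
     "phone": "2123661182", "postal_code": "10014"},
    {"name": "Joes Pizza Shop", "street_address": "7 Carmine Street",
     "phone": "2123661182", "postal_code": "10014"},
])
hashes = MinHashFunc(df, num_perm=128, stop_words=set(stopwords.words('english')))
print("Row similarity:", hashes[0].jaccard(hashes[1]))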
Example #14
def refer_query(lsh_forest, lsh, reference, grams):
    minhash = MinHash(num_perm=4000)
    reference = reference.replace(' ', '')
    for d in ngrams(reference, 3):
        minhash.update(''.join(d).encode('utf8'))

    query_result = lsh_forest.query(minhash, 1)
    query_result_thr = lsh.query(minhash)

    if query_result and query_result_thr:
        result = grams[query_result[0]]
        result_similar = [grams[item] for item in query_result_thr]
        if result in result_similar:
            return result
    else:
        return False
Example #15
    def test_deserialize_byteorder(self):
        for byteorder in "@=<>!":
            m1 = MinHash(10, 1, hashfunc=fake_hash_func)
            m1.update(123)
            lm1 = LeanMinHash(m1)
            buf = bytearray(lm1.bytesize(byteorder))
            lm1.serialize(buf, byteorder)

            # Test if we get back the exact same LeanMinHash objects after
            # deserializing from bytes
            lm1d = LeanMinHash.deserialize(buf, byteorder)
            self.assertEqual(lm1d.seed, lm1.seed)
            self.assertEqual(len(lm1d.hashvalues), len(lm1.hashvalues))
            self.assertTrue(
                all(hvd == hv
                    for hv, hvd in zip(lm1.hashvalues, lm1d.hashvalues)))
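For reference, the same serialization API outside the test harness; a minimal sketch using little-endian byte order (chosen arbitrarily):

from datasketch import MinHash, LeanMinHash

m = MinHash(num_perm=128)
for token in [b"lean", b"minhash", b"bytes"]:
    m.update(token)

lean = LeanMinHash(m)
buf = bytearray(lean.bytesize("<"))  # size needed for the serialized sketch
lean.serialize(buf, "<")             # write into the pre-allocated buffer
restored = LeanMinHash.deserialize(buf, "<")
assert restored.jaccard(lean) == 1.0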
Example #16
File: client.py  Project: Steap/SIXEcho
 def __init__(self, api_key=None, host_url=None, max_workers=1):
     """
     Initial sixecho
     Attributes:
         api_key(string)       - Optional : api_key generate from sixecho
         host_url(string)      - Optional : is sixecho domain
     """
     self.api_key = api_key
     deepcut.tokenize("Welcome")  # Load library
     if host_url is not None:
         if host_url.endswith("/"):
             host_url = host_url[:-1]
         self.host_url = host_url
     self.array_words = []
     self.min_hash = MinHash(num_perm=128)
     self.max_workers = max_workers
     self.sha256 = ""
Example #17
    def __init__(
        self,
        feature_length: Optional[int] = None,
        config: Optional[PradoProjectorConfig] = None,
    ):
        super().__init__()

        if config is None:
            config = PradoProjectorConfig(feature_length=feature_length)

        self._config = copy.deepcopy(config)
        self._hashobj = MinHash(num_perm=self.n_permutations,
                                hashfunc=farmhash.hash32)
        self._projection_operator = PradoProjectionOperator()

        self._vectorized_projection = np.vectorize(self.project,
                                                   signature="()->(n)")
Example #18
def lsh_predict_label(stems):
    '''
    Queries the LSH matcher and returns:
        0 if predicted spam
        1 if predicted ham
       -1 if parsing error
    '''
    minhash = MinHash(num_perm=128)
    if len(stems) < 2:
        return -1
    for s in stems:
        minhash.update(s.encode('utf-8'))
    matches = lsh.query(minhash)
    if matches:
        return 0
    else:
        return 1
Example #19
def lsh_predict_label(stems, lsh):
    '''
    Queries the LSH matcher and returns:
        0 if predicted spam
        1 if predicted ham
       -1 if parsing error
    '''
    minhash = MinHash(num_perm=128)
    if len(stems) < 2:
        return -1
    for s in stems:
        minhash.update(s.encode('utf-8'))
    matches = lsh.query(minhash)
    if matches:
        return 0
    else:
        return 1
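Examples #18 and #19 assume an `lsh` index already populated with MinHashes of known spam messages. A hedged sketch of how such an index might be built (the corpus and key names are placeholders, not from the original projects):

from datasketch import MinHash, MinHashLSH

def build_spam_lsh(spam_token_lists, num_perm=128, threshold=0.5):
    """Index MinHashes of known-spam token lists so lsh_predict_label
    can flag anything that collides with one of them."""
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for i, stems in enumerate(spam_token_lists):
        m = MinHash(num_perm=num_perm)
        for s in stems:
            m.update(s.encode('utf-8'))
        lsh.insert(f"spam-{i}", m)
    return lsh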
Example #20
def predict(tokens, database, perms, num_results, forest):
    start_time = time.time()

    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf-8'))

    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None

    # print(idx_array)
    # result=database[idx_array]

    print('It took %s seconds to query forest.' % (time.time() - start_time))

    return idx_array
Example #21
def find_relation_class_name_matchings(network, kr_handlers):
    # Retrieve relation names
    st = time.time()
    names = []
    seen_sources = []
    for (db_name, source_name, _, _) in network.iterate_values():
        original_source_name = source_name
        if source_name not in seen_sources:
            seen_sources.append(source_name)  # mark as seen
            source_name = nlp.camelcase_to_snakecase(source_name)
            source_name = source_name.replace('-', ' ')
            source_name = source_name.replace('_', ' ')
            source_name = source_name.lower()
            m = MinHash(num_perm=32)
            for token in source_name.split():
                if token not in stopwords.words('english'):
                    m.update(token.encode('utf8'))
            names.append(('relation', (db_name, original_source_name), m))

    num_relations_inserted = len(names)

    # Retrieve class names
    for kr_name, kr_handler in kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            m = MinHash(num_perm=32)
            for token in cl.split():
                if token not in stopwords.words('english'):
                    m.update(token.encode('utf8'))
            names.append(('class', (kr_name, original_cl_name), m))

    # Index all the minhashes
    lsh_index = MinHashLSH(threshold=0.5, num_perm=32)

    for idx in range(len(names)):
        lsh_index.insert(idx, names[idx][2])

    matchings = []
    for idx in range(0, num_relations_inserted):  # Compare only with classes
        N = lsh_index.query(names[idx][2])
        for n in N:
            kind_q = names[idx][0]
            kind_n = names[n][0]
            if kind_n != kind_q:
                # match.format is db_name, source_name, field_name -> class_name
                match = ((names[idx][1][0], names[idx][1][1], "_"),
                         names[n][1])
                matchings.append(match)
    et = time.time()
    print("Time to relation-class (name): " + str(et - st))
    return matchings
Example #22
def predict(text, database, perms, num_results, forest):
    start_time = time.time()

    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))

    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None  # if your query is empty, return none

    result = database.iloc[idx_array]

    print('It took %s seconds to query forest.' % (time.time()-start_time))

    return result
Example #23
    def _index_records(self, records):
        """
          Constructs Minhash LSH buckets for a given set of records

          Args:
            records (dict) : dict of (record_id -> record_value)

          Returns:
            None
        """
        indexer = defaultdict(list)

        # Create minhashes
        minhashes = {}
        for rid in records:
            m = MinHash(num_perm=self._num_perm)
            for d in records[rid]:
                qgrams = set(self.nt.basic(d, 2))
                for gram in qgrams:
                    m.update(gram.encode('utf-8'))
            minhashes[rid] = m

        # Create LSH instance and add min hashes
        if self._bands == MinHashLSHRecordDeduplication.BANDS and self._rows == MinHashLSHRecordDeduplication.ROWS:
            lsh = MinHashLSH(threshold=self._threshold,
                             num_perm=self._num_perm)
        else:
            lsh = MinHashLSH(num_perm=self._num_perm,
                             params=(self._bands, self._rows))

        max_blocks = []
        for rid in records:
            lsh.insert(rid, minhashes[rid])
            max_blocks.append(rid)

        # Generate blocks
        while (len(max_blocks) > 0):
            key = max_blocks[0]
            bucket = lsh.query(minhashes[key])
            for rid in bucket:
                if rid in max_blocks:
                    max_blocks.remove(rid)
                indexer["b" + str(self._block_index)].append(rid)
            self._block_index += 1

        self._write_indexer(indexer)
Example #24
def retrieve_class_names(kr_handlers, num_perm=32):
    names = list()

    for kr_name, kr_handler in kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            m = MinHash(num_perm=num_perm)
            for token in cl.split():
                if token not in stopwords.words('english'):
                    m.update(token.encode('utf8'))
            names.append(('class', (kr_name, original_cl_name), m))
    return names
Example #25
def main() -> None:
    minhashes = []
    files = []
    for iterator in tqdm(range(config.COUNT_UNQ_MHS),
                         desc="Generate minHashes:"):
        minhash = MinHash(num_perm=256)
        file = []
        for _ in range(200):
            rand_string = ''.join(
                random.choice(string.ascii_lowercase) for _ in range(5))
            file.append(rand_string)
        files.append(file)
        minhash.update_batch([s.encode('utf-8') for s in file])
        minhashes.append(("key" + str(iterator), minhash))

    lsh = MinHashLSH(threshold=0.5,
                     num_perm=256,
                     storage_config={
                         'type': 'cassandra',
                         'basename': b'perftest',
                         'cassandra': {
                             'seeds': ['127.0.0.1'],
                             'keyspace': config.KEY_SPACE,
                             'replication': {
                                 'class': 'SimpleStrategy',
                                 'replication_factor': '1',
                             },
                             'drop_keyspace': False,
                             'drop_tables': False,
                         }
                     })

    for _ in tqdm(range(1), desc="Insert 100 minHashes:"):
        with lsh.insertion_session(buffer_size=100) as session:
            for key, minhash in minhashes:
                session.insert(key, minhash)

    f_disc_mhs = open('minhashes.txt', 'w+')
    for minhash in tqdm(minhashes, desc="Log minHashes:"):
        log(f_disc_mhs, minhash[0], minhash[1].digest())
    f_disc_mhs.close()

    f_disc_files = open('files.txt', 'w+')
    for iterator in tqdm(range(len(files)), desc="Log files:"):
        log(f_disc_files, minhashes[iterator][0], files[iterator])
    f_disc_files.close()
Example #26
    def __train_LSH(self,data):
        start_time = time.time()
        forest = MinHashLSHForest(num_perm=config.permutations)
        for item in tqdm(data, desc="MinHash Docs.."):
            tag = item['tag']
            tokens = item['data']

            if self.type == 'trigram':
                tokens = self.normalizer.generate_ngrams_char(tokens[0])
            m = MinHash(num_perm=config.permutations)
            for s in tokens:
                m.update(s.encode('utf8'))
            forest.add(tag,m)

        forest.index()
        print('It took %.2f seconds to build forest.' % (time.time() - start_time))
        return forest
Example #27
 def test_update(self):
     m1 = MinHash(4, 1, hashfunc=fake_hash_func)
     try:
         lm1 = LeanMinHash(m1)
         lm1.update(12)
     except TypeError:
         pass
     else:
         raise Exception
Example #28
 def tokenize_method(audio):
     feature = audio.features[feature_name]
     pace = 20
     offset = 70
     blocks = zip(*[feature[i:] for i in range(shingle_size)])
     prewords = [[
         chr(int(i * pace) + offset) for i in normalize(np.array(bl))
     ] for bl in blocks]
     a = []
     if use_minhash:
         for preword in prewords:
             m = MinHash(num_perm=min_hash_fns)
             m.update(' '.join(preword).encode('utf-8'))
             tx = ''.join([str(c) for c in m.hashvalues])
             a.append(hashlib.md5(tx.encode('utf-8')).hexdigest())
         return ' '.join(a)
     else:
         return ' '.join([''.join(p) for p in prewords])
Example #29
def add_untopic_doc(in_path, file_id):
    js = json.load(codecs.open(in_path, "r"))
    line = js["content_full_text"]

    seg_list = basic_preprocess(line, "utf8", True, True)
    #seg_list = jieba.cut(line, cut_all = False)
    #seg_list = stemmer_by_porter(seg_list)

    no_list = []
    for word in seg_list:
        if (word not in stopword) and (len(word) > 1):
            no_list.append(word)

    mh = MinHash(num_perm=128)
    for word in no_list:
        mh.update(word.encode('utf8'))

    lsh.insert(file_id, mh)
Example #30
def toBuildLSH(cleanSongs):
    '''
    :param cleanSongs
    :return: forest, min_hash_list
    '''
    forest = MinHashLSHForest(num_perm=128)
    min_hash_list = []
    for songIndex, song in enumerate(cleanSongs):
        minhash = MinHash(num_perm=128)
        for word in song:
            ### encoding each word
            minhash.update(word.encode('utf8'))
        ### add each song's minhash to the forest as well as min_hash_list
        forest.add(str(songIndex), minhash)
        min_hash_list.append(minhash)

    forest.index()
    return forest, min_hash_list
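A possible follow-up query against the returned forest: minhash a new token list and ask for its single nearest song (assumes cleanSongs is the tokenized corpus passed in above; the query words are made up):

from datasketch import MinHash

forest, min_hash_list = toBuildLSH(cleanSongs)

query = MinHash(num_perm=128)
for word in ["never", "gonna", "give", "you", "up"]:
    query.update(word.encode('utf8'))

# Keys returned by the forest are the string song indices used in forest.add().
print(forest.query(query, 1))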
Example #31
 def test_update(self):
     m1 = MinHash(4, 1, hashobj=FakeHash)
     try:
         lm1 = LeanMinHash(m1)
         lm1.update(12)
     except TypeError:
         pass
     else:
         raise Exception
Example #32
    def _get_raw_class_matches(self, class_name, lsh):
        if not self._classes_signatures:
            self.get_classes_signatures()

        class_signatures = self._classes_signatures[class_name]

        if class_signatures:
            self._lsh_classes.update([class_name])

            m = MinHash(num_perm=config.LSH_PERM_NUM)
            for signature in class_signatures:
                m.update(signature.encode('utf8'))

            matches = lsh.query(m, len(class_signatures))

            return set(matches)
        else:
            return set()
Example #33
def lsh_clustering(
    signatures: List[np.ndarray],
    threshold: float = 0.5,
    num_perm: int = 128,
):
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    with lsh.insertion_session() as session:
        for key, minhash in enumerate(signatures):
            session.insert(f"id-{key}",
                           MinHash(num_perm=num_perm, hashvalues=minhash))

    neighbors: List[List[int]] = []

    for key, minhash in enumerate(signatures):
        result = lsh.query(MinHash(num_perm=num_perm, hashvalues=minhash))
        neighbors.append([int(x.split("-")[1]) for x in result])

    return neighbors
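One way the `signatures` argument above might be produced, using each document's raw MinHash hash values (the documents are illustrative):

from datasketch import MinHash

docs = [["minhash", "lsh", "clustering"],
        ["minhash", "lsh", "dedup"],
        ["completely", "different", "tokens"]]

signatures = []
for tokens in docs:
    m = MinHash(num_perm=128)
    for t in tokens:
        m.update(t.encode('utf-8'))
    signatures.append(m.hashvalues)

print(lsh_clustering(signatures, threshold=0.5, num_perm=128))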
Example #34
    def extract_attribute(self, base_object: BDFunction) -> int:
        # Check if value already exists
        FunctionMinHashLSH_value = base_object.get_attribute_value('FunctionMinHashLSH')

        if not FunctionMinHashLSH_value:
            normalized_instr_set: set = set(base_object.get_attribute_value('FunctionNormalized'))

            # Create MinHash object
            minhash = MinHash(num_perm=Configuration.MINHASH_PERMUTATIONS, seed=Configuration.MINHASH_SEED)
            for instr in normalized_instr_set:
                minhash.update(instr.encode('utf8'))

            base_object.add_attribute_value('FunctionMinHashLSH', {'function_lsh': minhash.digest()})
            FunctionMinHashLSH_value = base_object.get_attribute_value('FunctionMinHashLSH')

        return FunctionMinHashLSH_value['function_lsh'] if FunctionMinHashLSH_value else None
Example #35
 def get_min_hash(self, x):
     """
     Create a MinHash object for the input example string
     using w-shingling.
     
     Parameters:
         x - A list of strings representing an example.
     
     Returns:
         A datasketch.MinHash object updated with
         the generated w-shingles.
     """
     min_hash = MinHash(num_perm=self.num_perm, seed=self.random_state)
     # we accumulate all shingles extracted from each string
     for x_str in x:
         # map string x_str to a set of shingles
         x_shingles = MinHashNearestNeighbor.get_w_shingles(x_str, self.w)
         for shingle in x_shingles:
             min_hash.update(shingle)
     return min_hash
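The get_w_shingles helper is not shown; a minimal sketch of what it presumably does (a static method on MinHashNearestNeighbor in the original, written here as a standalone function that returns byte shingles so they can go straight into MinHash.update):

def get_w_shingles(text, w):
    """Contiguous character w-grams of `text`, encoded to bytes."""
    return {text[i:i + w].encode('utf8') for i in range(len(text) - w + 1)}

# get_w_shingles("minhash", 3) -> {b'min', b'inh', b'nha', b'has', b'ash'}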
Example #36
    def _hello_world():
        """
        This fragment was taken from the datasketch github page:
        https://github.com/ekzhu/datasketch
        """
        data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
                'estimating', 'the', 'similarity', 'between', 'datasets']
        data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
                'estimating', 'the', 'similarity', 'between', 'documents']

        m1, m2 = MinHash(), MinHash()
        for d in data1:
            m1.update(d.encode('utf8'))
        for d in data2:
            m2.update(d.encode('utf8'))
        print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

        s1 = set(data1)
        s2 = set(data2)
        actual_jaccard = float(len(s1.intersection(s2)))/float(len(s1.union(s2)))
        print("Actual Jaccard for data1 and data2 is", actual_jaccard)
Example #37
def minhash_str(in_str, perms, gram_sz):
	minhash = MinHash(num_perm=perms)
	for d in ngrams(in_str, gram_sz):
		minhash.update("".join(d).encode('utf-8'))
	return minhash
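A quick usage sketch for minhash_str, comparing two near-identical strings (run alongside the snippet above, which provides the MinHash and ngrams imports):

a = minhash_str("locality sensitive hashing", perms=128, gram_sz=3)
b = minhash_str("locality-sensitive hashing", perms=128, gram_sz=3)
print("Estimated Jaccard:", a.jaccard(b))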
Example #38
        if args.header:
            next(f)
        #TODO test robustness
        #mycorpus=[(i,set(line.encode('utf8', 'ignore').lower().split())) for i,line in enumerate(f)]
        mycorpus=[(i,set(line.lower().split())) for i,line in enumerate(f)]

    print(("--- %s seconds ---" % (time.time() - start_time)))

    print('Calculate minhash signatures')
    start_time = time.time()

    #prepare dictionary of hashes
    hashcorp=dict.fromkeys([tup[0] for tup in mycorpus])
    #compute hashes
    for key, doc in mycorpus:
        # compute minhash signature
        m = MinHash(num_perm=num_permutations)
        for token in doc:
            m.update(token.encode('utf8'))
        hashcorp[key] = m
    print(("--- %s seconds ---" % (time.time() - start_time)))
    if num_processes > 1:
        if len(thresholds) < num_processes:
            num_processes = len(thresholds)
        p = Pool(num_processes)
        assignment = [(x,) for x in thresholds]
        p.map(compute_clusters, assignment)
    else:
        for x in thresholds:
            compute_clusters((x,))

Example #39
File: program.py  Project: livnatg/proj
newSentence = []
for i in range(num_sentences):
    newSentence.append(model.getSentence(word_to_index,index_to_word))

# print(len(newSentence))
# print (newSentence)
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')
for sen in newSentence:
    data1 = [token.lower().strip(string.punctuation) for token in nltk.word_tokenize(sen) \
                    if token.lower().strip(string.punctuation) not in stopwords]
    f = open('data/data.csv', 'r')
    for line in f:
        data2 = [token.lower().strip(string.punctuation) for token in nltk.word_tokenize(line) \
                        if token.lower().strip(string.punctuation) not in stopwords]
        m1, m2 = MinHash(), MinHash()
        for d in data1:
            m1.update(d.encode('utf8'))
        for d in data2:
            m2.update(d.encode('utf8'))
        # print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

        s1 = set(data1)
        s2 = set(data2)
        actual_jaccard = float(len(s1.intersection(s2)))/float(len(s1.union(s2)))

        if actual_jaccard > 0.3:
            print("Actual Jaccard for data1 and data2 is", actual_jaccard)
            print(sen)
            print(line)
Example #40
 def query(self, v, n):
     m = MinHash(num_perm = self._n_perm)
     for e in v:
         m.update(str(e).encode('utf8'))  # MinHash.update expects bytes
     return map(int, self._index.query(m, n))
Example #41
def get_min_hash(text, too_common, num_perm=128):
    min_hash = MinHash(num_perm=num_perm)
    for shingle_h in shingle_hashes(text):
        if shingle_h.hexdigest() not in too_common:
            # feed the shingle's digest bytes into the MinHash
            min_hash.update(shingle_h.digest())
    return min_hash
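shingle_hashes is not defined in this snippet; a hedged sketch of a compatible helper (word 5-shingles hashed with SHA-1 are an assumption):

from hashlib import sha1

def shingle_hashes(text, k=5):
    """Yield a SHA-1 hash object for each k-word shingle of `text`."""
    words = text.split()
    for i in range(max(len(words) - k + 1, 1)):
        yield sha1(" ".join(words[i:i + k]).encode('utf8'))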