Example #1
 def fit(self, X):
     self._index = MinHashLSHForest(num_perm = self._n_perm, l = self._n_rep)
     for i, x in enumerate(X):
         m = MinHash(num_perm = self._n_perm)
         for e in x:
             m.update(str(e).encode('utf8'))  # MinHash.update expects bytes
         self._index.add(str(i), m)
     self._index.index()
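The fit method above belongs to a wrapper class, so self._n_perm and self._n_rep are attributes set elsewhere. A minimal standalone sketch of the same indexing pattern, using made-up sample data:

from datasketch import MinHash, MinHashLSHForest

X = [["a", "b", "c"], ["b", "c", "d"], ["x", "y", "z"]]  # hypothetical input items
forest = MinHashLSHForest(num_perm=128)
for i, x in enumerate(X):
    m = MinHash(num_perm=128)
    for e in x:
        m.update(str(e).encode('utf8'))  # MinHash.update expects bytes
    forest.add(str(i), m)
forest.index()             # the forest must be indexed before it can be queried
print(forest.query(m, 2))  # keys of up to 2 nearest candidates for the last item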
Example #2
def find_relation_class_name_matchings(network, kr_handlers):
    # Retrieve relation names
    st = time.time()
    names = []
    seen_sources = []
    for (db_name, source_name, _, _) in network.iterate_values():
        original_source_name = source_name
        if source_name not in seen_sources:
            seen_sources.append(source_name)  # mark as seen
            source_name = nlp.camelcase_to_snakecase(source_name)
            source_name = source_name.replace('-', ' ')
            source_name = source_name.replace('_', ' ')
            source_name = source_name.lower()
            m = MinHash(num_perm=32)
            for token in source_name.split():
                if token not in stopwords.words('english'):
                    m.update(token.encode('utf8'))
            names.append(('relation', (db_name, original_source_name), m))

    num_relations_inserted = len(names)

    # Retrieve class names
    for kr_name, kr_handler in kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            m = MinHash(num_perm=32)
            for token in cl.split():
                if token not in stopwords.words('english'):
                    m.update(token.encode('utf8'))
            names.append(('class', (kr_name, original_cl_name), m))

    # Index all the minhashes
    lsh_index = MinHashLSH(threshold=0.5, num_perm=32)

    for idx in range(len(names)):
        lsh_index.insert(idx, names[idx][2])

    matchings = []
    for idx in range(0, num_relations_inserted):  # Compare only with classes
        N = lsh_index.query(names[idx][2])
        for n in N:
            kind_q = names[idx][0]
            kind_n = names[n][0]
            if kind_n != kind_q:
                # match.format is db_name, source_name, field_name -> class_name
                match = ((names[idx][1][0], names[idx][1][1], "_"),
                         names[n][1])
                matchings.append(match)
    et = time.time()
    print("Time to relation-class (name): " + str(et - st))
    return matchings
Example #3
    def embed(self, corpus: List[str]) -> np.ndarray:

        signatures: List[np.ndarray] = []
        for doc in corpus:
            m = MinHash(num_perm=self.num_perm)
            for ngram in ngrams(doc, self.ngram_size):
                m.update(" ".join(ngram).encode("utf-8"))
            signatures.append(m.hashvalues)

        return np.asarray(signatures)
Example #4
File: MinHash.py Project: hokaii/CodeShow
def calculate_jaccard(text1, text2):  # compute the Jaccard similarity between two lines of text
    minhash1, minhash2 = MinHash(), MinHash()

    for word in text1:
        minhash1.update(word.encode('utf-8'))

    for word in text2:
        minhash2.update(word.encode('utf-8'))

    return minhash1.jaccard(minhash2)
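A possible usage sketch for calculate_jaccard, assuming the two arguments are iterables of word tokens:

text1 = "minhash estimates jaccard similarity quickly".split()
text2 = "minhash approximates jaccard similarity quickly".split()
print(calculate_jaccard(text1, text2))  # estimate near the exact Jaccard of 4/6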
Example #5
def hashing():
    m = []
    for index in df.index:
        t = create_shingles(df.at[index, 'urlDrugName'])
        mh = MinHash(num_perm=256)
        for d in t:
            mh.update(d.encode('utf8'))
        m.append(mh)
        del mh
    return m
Example #6
def compute_minhash(column):
    permutations = config.MINHASH_PARAMS['num_permutations']
    encoding = config.MINHASH_PARAMS['encoding']
    minhash = MinHash(num_perm=permutations)

    for elem in column:
        minhash.update(elem.encode(encoding))

    return minhash
Example #7
def get_min_hash(shingles: set) -> MinHash:
    """
    given a set of shingles, creates a MinHash object updated with those shingles.
    :param shingles: a set of track shingles.
    :return: a MinHash object updated with the given shingles.
    """
    track_min_hash = MinHash(num_perm=128)
    for shin in shingles:
        track_min_hash.update(str(shin).encode('utf-8'))
    return track_min_hash
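A hedged usage sketch; the shingle sets below are hypothetical stand-ins for real track shingles:

shingles_a = {("C", "E", "G"), ("E", "G", "B"), ("G", "B", "D")}
shingles_b = {("C", "E", "G"), ("G", "B", "D"), ("B", "D", "F")}
print(get_min_hash(shingles_a).jaccard(get_min_hash(shingles_b)))  # estimate near 2/4 = 0.5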
Example #8
def DIDsampling(dataset,BF,username,userid,attra_id,beta,clustdict):
    # dataset = Cora_labeled.objects.all()
    # clustdict = dextrapreclustering.minhashPreClustering(dataset)
    cluster_membership = {}

    # values = models.sigirCoraAttrValue.objects.filter(attr_id=attra_id)
    # attrasynonyms = models.sigirCoraValueSynonym.objects.filter(value_id__in=[ value.id for value in values])
    # record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username,attrsynonym_id__in=[ syn.id for syn in attrasynonyms])
    record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username,attrsynonym__value__attr_id=attra_id)
    record_noAttra = dataset.exclude(id__in=[item.cora_id for item in record_hasAttra])

    for k, v in clustdict.items():
        for d in v:
            cluster_membership[d] = k

    cluster_size_sum = 0.000001
    for record in BF:
        cluster_size_sum = cluster_size_sum + len(clustdict[cluster_membership[record.id]])
    for record in BF:
        # AC
        cora2ae = models.sigirCoraToAttrEntity.objects.filter(cora_id=record.id,user=username)
        if cora2ae:
            attr_ids = [item.attrsynonym.value.attr.id for item in cora2ae]
            if attra_id in attr_ids:
                record.orderscore = 0
                record.save()
                continue
            else:
                ac = 1 - len(attr_ids) / models.sigirCoraAttr.objects.filter(userid=userid).count()
        else:
            ac = 1

        # distribution on dataset
        k = cluster_membership[record.id]
        ic = len(clustdict[k])/record_noAttra.count()
        record_minhash = MinHash(num_perm=128)
        s = set(record.cleantext.split(" "))
        for d in s:
            record_minhash.update(d.encode('utf8'))
        term2sum = 0
        for rr in BF:
            rr_minhash =  MinHash(num_perm=128)
            ss = set(rr.cleantext.split(" "))
            for dd in ss:
                rr_minhash.update(dd.encode('utf8'))
            sim = record_minhash.jaccard(rr_minhash)
            sim = (sim / cluster_size_sum) ** beta
            term2sum = term2sum + sim
        did = ac*ic*term2sum
        record.orderscore = did
        record.save()
    return BF
Example #9
 def test_count(self):
     m = MinHash(hashobj=FakeHash)
     m.update(11)
     m.update(123)
     m.update(92)
     m.update(98)
     m.update(123218)
     m.update(32)
     lm = LeanMinHash(m)
     c = lm.count()
     self.assertGreaterEqual(c, 0)
Example #10
    def _get_min_hash(self, r):
        """

        :param r:   The incoming row
        :return:    resulting min hash
        """
        cset = self._get_shingle(r)
        m = MinHash(self.__num_perm)
        for c in cset:
            m.update(c.encode('utf-8'))
        return m
Example #11
def similar_videos_from_forest(inferred_label_probabilities):
    inferred_labels_full = convert_inferred_labels_to_list(
        inferred_label_probabilities)
    minhash = MinHash(num_perm=128)
    for label in inferred_labels_full:
        minhash.update(label)  # labels are assumed to be bytes already; encode them first if they are str

    if forest is None:
        load_forest()

    return forest.query(minhash, 10)
Example #12
def mh_digest(data):
    """
    create a MinHash digest
    """
    num_perm = 512
    m = MinHash(num_perm)

    for d in data:
        m.update(d.encode('utf8'))

    return m
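A short usage sketch for mh_digest; the token lists are illustrative:

m1 = mh_digest(["roses", "are", "red"])
m2 = mh_digest(["roses", "are", "blue"])
print(m1.jaccard(m2))  # estimate near the exact Jaccard of 2/4 = 0.5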
Example #13
def get_codes(text_val, nbit=128):
    '''
    Converts the text data to a binary code.
    :param text_val: The text data to be converted to a minhash binary code.
    :param nbit: The size of the binary code to be returned.
    :return: A binary code representation of the text string.
    '''
    minhash = MinHash(num_perm=nbit)
    for c, i in enumerate(text_val):
        minhash.update("".join(i).encode('utf-8'))
    return minhash
Example #14
def minhash_implem(url_shingles_list):
    list_url_hash = []
    for url in range(len(url_shingles_list)):
        m = MinHash(num_perm=8)
        shingle_list = url_shingles_list[url][1]
        for shingle in shingle_list:
            m.update(shingle.encode('utf8'))
        list_url_hash.append(
            ["{0}".format(url_shingles_list[url][0]),
             m.digest()])
    return list_url_hash
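A usage sketch, assuming the input is a list of (url, shingle_list) pairs as the loop above indicates:

pairs = [("http://a.example", ["sh1", "sh2", "sh3"]),
         ("http://b.example", ["sh2", "sh3", "sh4"])]
for url, digest in minhash_implem(pairs):
    print(url, list(digest)[:4])  # a few of the 8 hash values kept per URL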
Example #15
 def test_count(self):
     m = MinHash(hashfunc=fake_hash_func)
     m.update(11)
     m.update(123)
     m.update(92)
     m.update(98)
     m.update(123218)
     m.update(32)
     lm = LeanMinHash(m)
     c = lm.count()
     self.assertGreaterEqual(c, 0)
Example #16
 def make_lsh(self,shingle_length=2,threshold=0.8):
     print(f'Making LSH with threshold of {threshold}, shingle length of {shingle_length}')
     sets = self.make_shingle_sets(self.indoc,shingle_length)
     self.minhashes = {}
     self.lsh = MinHashLSH(threshold=threshold, num_perm=128)
     for k in sets.keys():
         m = MinHash(num_perm=128)
         for item in sets[k]:
             m.update(item.encode('utf8'))
         self.minhashes[k] = m  # store the finished minhash once per key
         self.lsh.insert(k, m)
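make_lsh stores the per-key minhashes and the LSH index on the instance; a hedged follow-up sketch, written as another method of the same class, showing how near-duplicates could then be listed:

 def report_near_duplicates(self):
     for key, mh in self.minhashes.items():
         candidates = self.lsh.query(mh)  # keys whose estimated Jaccard >= threshold
         dupes = [c for c in candidates if c != key]
         if dupes:
             print(key, '->', dupes)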
Example #17
def apply_lsh(group, col):
    lsh = MinHashLSH(threshold=0.9, num_perm=256)
    minhashes = {}
    for idx, text in group[col].iteritems():
        minhash = MinHash(num_perm=256)
        for d in ngrams(text, 3):
            minhash.update("".join(d).encode('utf-8'))
        index = group.loc[idx, 'productId']
        lsh.insert(key=index, minhash=minhash)
        minhashes[index] = minhash
    return lsh, minhashes
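A usage sketch for apply_lsh, assuming a pandas DataFrame with a 'productId' column and a text column (and a pandas version in which Series.iteritems still exists):

import pandas as pd

products = pd.DataFrame({'productId': [1, 2, 3],
                         'text': ['red running shoes', 'red running shoe', 'blue rain jacket']})
lsh, minhashes = apply_lsh(products, 'text')
print(lsh.query(minhashes[1]))  # productIds whose estimated 3-gram similarity is >= 0.9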
Example #18
def get_hashbands(window):
    minhash = MinHash(num_perm=config['n_permutations'], seed=1)
    for ngram in set(ngrams(' '.join(window), 3)):
        minhash.update(''.join(ngram).encode('utf8'))
    hashband_vals = []
    for c, i in enumerate(minhash.hashvalues):
        hashband_vals.append(i)
        if len(hashband_vals) == config['hashband_length']:
            hashband = '.'.join([str(j) for j in hashband_vals])
            hashband_vals = []
            yield hashband
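A usage sketch; config here is a hypothetical stand-in for the module-level settings the generator reads:

config = {'n_permutations': 64, 'hashband_length': 4}
window = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
for band in get_hashbands(window):
    print(band)  # dot-joined groups of 4 hash values, usable as LSH bucket keys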
Example #19
def get_jaccard_index(sequence1, sequence2, k):
    seq1_minHash, seq2_minHash = MinHash(), MinHash()
    seq1_kmers = count_kmers(sequence1, k)
    seq2_kmers = count_kmers(sequence2, k)
    seq1_keys = list(seq1_kmers.keys())
    seq2_keys = list(seq2_kmers.keys())
    for key in seq1_keys:
        seq1_minHash.update(key.encode('utf8'))
    for key in seq2_keys:
        seq2_minHash.update(key.encode('utf8'))
    return seq1_minHash.jaccard(seq2_minHash)
Example #20
def calculate_lsh(text, mode, lsh_type):
    """Calculate LSH for the file"""
    min_hash = MinHash(num_perm=128)
    if mode == 'char':
        for d in ngrams(text, 3):
            min_hash.update("".join(d).encode('utf-8'))
    elif mode == 'word':
        for d in ngrams(text, 3):
            min_hash.update(" ".join(d).encode('utf-8'))
    result = lsh_type.query(min_hash)
    return result
Example #21
def predict(text, database, perms, num_results, forest):
    #get top results for LSH forest
    text_preprocessed = preprocess(text)
    m = MinHash(num_perm=perms)
    for d in ngrams(text_preprocessed, 3):
        m.update("".join(d).encode('utf-8'))
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None  # if your query is empty, return none

    result = database.iloc[idx_array]['wikidata_numeric_id'].astype(int)
    return result
Example #22
def print_minhash_to_pickle(from_k, to_k, sequence_dict):
    for kk in range(from_k, to_k):
        print(kk)
        accession_number_dict = dict()
        for key, value in sequence_dict.items():
            minHash = MinHash()
            kmer = count_kmers(value[1], kk)
            for kmer_key in kmer.keys():
                minHash.update(kmer_key.encode('utf8'))
            accession_number_dict[key] = minHash
        filepath = get_minhash_pickle_filename(kk, basePath)
        pickle.dump(accession_number_dict, open(filepath, 'wb'))
Example #23
def minhash(x, y):
    m1, m2 = MinHash(), MinHash()

    s1 = extract_keywords(x)
    s2 = extract_keywords(y)

    for data in s1:
        m1.update(data.encode('utf8'))
    for data in s2:
        m2.update(data.encode('utf8'))

    return m1.jaccard(m2)
Example #24
 def test_jaccard(self):
     m1 = MinHash(4, 1, hashobj=FakeHash)
     m2 = MinHash(4, 1, hashobj=FakeHash)
     lm1 = LeanMinHash(m1)
     lm2 = LeanMinHash(m2)
     self.assertTrue(lm1.jaccard(lm2) == 1.0)
     m2.update(12)
     lm2 = LeanMinHash(m2)
     self.assertTrue(lm1.jaccard(lm2) == 0.0)
     m1.update(13)
     lm1 = LeanMinHash(m1)
     self.assertTrue(lm1.jaccard(lm2) < 1.0)
Example #25
def create_LSH_Forest():
    global forest
    if os.path.isfile(LSH_FOREST_FILE):
        load_forest()
    else:
        forest = MinHashLSHForest(num_perm=128)
    train_records = glob.glob("dataset/train*.tfrecord")
    validate_records = glob.glob("dataset/validate*.tfrecord")
    all_records = train_records + validate_records
    dataset = tf.data.TFRecordDataset(all_records)
    iterator = dataset.make_one_shot_iterator()
    count = 0
    next_element = iterator.get_next()
    updated = False
    with tf.Session() as sess:
        try:
            while True:
                if count % 10000 == 0:
                    print "[SimpleVideoSearch][{}] Processed {} records from the dataset so far".format(
                        datetime.now(), count)
                if updated and count % 100000 == 0:
                    with open(LSH_FOREST_FILE, 'wb') as forest_file:
                        forest.index()
                        pickle.dump(forest, forest_file,
                                    pickle.HIGHEST_PROTOCOL)
                    print "[SimpleVideoSearch][{}] Updated LSH Forest file".format(
                        datetime.now(), count)
                exampleBinaryString = sess.run(next_element)
                example = tf.train.Example.FromString(exampleBinaryString)
                count += 1
                example_id = example.features.feature["id"].bytes_list.value[0]
                if example_id not in forest:
                    if not updated:
                        updated = True
                        print('[SimpleVideoSearch][{}] First update at record {}'.format(
                            datetime.now(), count))
                    dataset_labels_full = convert_dataset_labels_to_list(
                        example.features.feature["labels"].int64_list.value)
                    minhash = MinHash(num_perm=128)
                    for label in dataset_labels_full:
                        minhash.update(label)  # assumes the converted labels are bytes; encode them first otherwise
                    forest.add(example_id, minhash)
        except tf.errors.OutOfRangeError:
            print "[SimpleVideoSearch][{}] Done iterating through dataset".format(
                datetime.now())
        finally:
            print "[SimpleVideoSearch][{}] Processed {} records from the dataset".format(
                datetime.now(), count)
            forest.index()
            with open(LSH_FOREST_FILE, 'wb') as forest_file:
                pickle.dump(forest, forest_file, pickle.HIGHEST_PROTOCOL)
            print "[SimpleVideoSearch][{}] Finished creating LSH Forest file".format(
                datetime.now(), count)
Example #26
def xiaoming(lines, length_of_f, i):
    first_user = lines[i]
    a = first_user.split()
    user_ID = a[0] + '    ' + '\n'
    print(a[1])
    hash_value = [0 for i in range(0, 51)]  # list used for writing results to a file
    sort_value = [0 for i in range(0, 51)]  # list that holds the values while they are being updated
    # Create a per-user file that stores the MinHash similarity between this user and every other user
    #f_hash  = open('user_minhash/'+a[0]+'.txt',mode = 'w',encoding = 'utf-8')
    # f_hash.write(a[0] + '    '+'\n')  # write this user's ID
    for j in range(length_of_f - 1):
        if (i == j):
            continue
        else:
            second_user = lines[j]

            b = second_user.split()
            data1 = a[2].split(',')
            data2 = b[2].split(',')

            # If the two lists have 0 or 1 items in common, discard this pair
            length_of_data1_and_data2 = len(data1) + len(data2)
            # Deduplicate
            both_data1_2 = list(set(data1 + data2))

            if ((length_of_data1_and_data2 - len(both_data1_2)) > 20):
                m1, m2 = MinHash(), MinHash()
                for d in data1:
                    m1.update(d.encode('utf8'))
                for d in data2:
                    m2.update(d.encode('utf8'))
                #print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
                if (m1.jaccard(m2) <= 0.0078125):
                    continue
                else:
                    c = [b[0], m1.jaccard(m2)]
                    for m in range(49, -1, -1):
                        if (sort_value[m] == 0):
                            sort_value[m] = c
                        else:
                            if (c[1] > sort_value[m][1]):
                                sort_value[m + 1] = sort_value[m]
                                sort_value[m] = c
                            else:
                                break
            else:
                continue
                #new_value = b[0] + '    ' +str(m1.jaccard(m2)) + '\n'
                #hash_value.append(new_value)

    return sort_value
    #f_hash.write(b[0] + '    ' +str(m1.jaccard(m2)) + '\n')
    """ for item in sort_value:
Example #27
	def calculateLSH(self,data,n):
		# Create MinHash objects
		print("Initializing the LSH for %i samples" % len(data))
		for c, i in enumerate(data):
			print("data" + str(i[0]))
			minhash = MinHash(num_perm=self.num_perm)
			for d in ngrams(i[1], n):
				minhash.update("".join(d).encode('utf-8'))
			self.lsh.insert(i[0], minhash)
			self.minhashes[i[0]] = minhash
		print(self.minhashes.keys())
		return self.minhashes, self.lsh
Example #28
    def get_min_hash(self, title, blurb):
        """
        this method generates min hash
        """
        signature = MinHash()
        filter_stopwords = lambda x: x not in self.stopwords
        contents = self.strip_puncs(title.lower() + " " + blurb.lower()).split()
        cleaned_contents = set(list(filter(filter_stopwords, contents)))

        for current in cleaned_contents:
            signature.update(current.encode('utf-8'))
        return signature
Example #29
File: utils.py Project: Xuzhiqian/Demo
def getTextHashValues(text, ngram=5):
    """
    Compute the hash values of a piece of text using datasketch and NLTK
    :param text: a text string
    :param ngram: the n-gram size; 5 is recommended for short texts
    :return: numpy.ndarray of hash values computed by datasketch
    """
    assert type(text) == str, 'INPUT text should be str'
    m = MinHash()
    for i in nltk.ngrams(text, ngram):
        m.update(''.join(i).encode('utf8'))
    return m.hashvalues
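Because getTextHashValues returns the raw hash values rather than a MinHash object, the similarity of two texts can be estimated by comparing the arrays position-wise; a small sketch (not part of the original file):

import numpy as np

hv1 = getTextHashValues("the quick brown fox jumps over the lazy dog")
hv2 = getTextHashValues("the quick brown fox jumped over a lazy dog")
print(np.mean(hv1 == hv2))  # fraction of matching minimum hashes, an estimate of the Jaccard similarity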
Example #30
    def minhash_from_string(self, input_string):
        """
        Generates minhash object from string
        """
        shingles = self.extract_shingles(input_string)
        shingle_set = set(shingles)

        m = MinHash(num_perm=self.num_perm)
        for i in shingle_set:
            m.update(i.encode('utf8'))

        return m
Example #31
 def test_jaccard(self):
     m1 = MinHash(4, 1, hashfunc=fake_hash_func)
     m2 = MinHash(4, 1, hashfunc=fake_hash_func)
     lm1 = LeanMinHash(m1)
     lm2 = LeanMinHash(m2)
     self.assertTrue(lm1.jaccard(lm2) == 1.0)
     m2.update(12)
     lm2 = LeanMinHash(m2)
     self.assertTrue(lm1.jaccard(lm2) == 0.0)
     m1.update(13)
     lm1 = LeanMinHash(m1)
     self.assertTrue(lm1.jaccard(lm2) < 1.0)
Example #32
 def minhash_tweet(self, tweet_text):
     """Minhashing operation that allows for a caching of up to
     1M tweets in order to speed up the checking procedure when it's
     the same tweet text"""
     tweet_hash = MinHash(num_perm=self.permutations)
     for word in tweet_text.split():
         tweet_hash.update(
             self.punct.sub("", word).encode('utf8')  # strip punctuation first, then encode to bytes
         )
     return tweet_hash
Example #33
 def get_min_hash(self, x):
     """
     Create a MinHash object for the input example string
     using w-shingling.
     
     Parameters:
         x - A list of strings representing an example.
     
     Returns:
         A datasketch.MinHash object updated with
         the generated w-shingles.
     """
     min_hash = MinHash(num_perm=self.num_perm, seed=self.random_state)
     # we accumulate all shingles extracted from each string
     for x_str in x:
         # map string x_str to a set of shingles
         x_shingles = MinHashNearestNeighbor.get_w_shingles(x_str, self.w)
         for shingle in x_shingles:
             min_hash.update(shingle.encode('utf8'))  # encode str shingles to bytes before hashing
     return min_hash
Example #34
    def _hello_world():
        """
        This fragment was taken from the datasketch github page:
        https://github.com/ekzhu/datasketch
        """
        data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
                'estimating', 'the', 'similarity', 'between', 'datasets']
        data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
                'estimating', 'the', 'similarity', 'between', 'documents']

        m1, m2 = MinHash(), MinHash()
        for d in data1:
            m1.update(d.encode('utf8'))
        for d in data2:
            m2.update(d.encode('utf8'))
        print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

        s1 = set(data1)
        s2 = set(data2)
        actual_jaccard = float(len(s1.intersection(s2)))/float(len(s1.union(s2)))
        print("Actual Jaccard for data1 and data2 is", actual_jaccard)
Example #35
 def query(self, v, n):
     m = MinHash(num_perm = self._n_perm)
     for e in v:
         m.update(str(e).encode('utf8'))  # encode to bytes, matching fit()
     return map(int, self._index.query(m, n))
Example #36
def minhash_str(in_str, perms, gram_sz):
	minhash = MinHash(num_perm=perms)
	for d in ngrams(in_str, gram_sz):
		minhash.update("".join(d).encode('utf-8'))
	return minhash
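A brief usage sketch pairing minhash_str with an LSH index; the documents and threshold are illustrative:

from datasketch import MinHashLSH

docs = {'a': 'locality sensitive hashing', 'b': 'locality-sensitive hashing', 'c': 'minhash sketches'}
lsh = MinHashLSH(threshold=0.5, num_perm=128)
for key, text in docs.items():
    lsh.insert(key, minhash_str(text, 128, 3))
print(lsh.query(minhash_str('locality sensitive hashing', 128, 3)))  # expected to include 'a' and 'b'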
Example #37
File: program.py Project: livnatg/proj
newSentence = []
for i in range(num_sentences):
    newSentence.append(model.getSentence(word_to_index,index_to_word))

# print(len(newSentence))
# print (newSentence)
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')
for sen in newSentence:
    data1 = [token.lower().strip(string.punctuation) for token in nltk.word_tokenize(sen) \
                    if token.lower().strip(string.punctuation) not in stopwords]
    f = open('data/data.csv', 'r')  # open in text mode so word_tokenize receives str, not bytes
    for line in f:
        data2 = [token.lower().strip(string.punctuation) for token in nltk.word_tokenize(line) \
                        if token.lower().strip(string.punctuation) not in stopwords]
        m1, m2 = MinHash(), MinHash()
        for d in data1:
            m1.update(d.encode('utf8'))
        for d in data2:
            m2.update(d.encode('utf8'))
        # print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

        s1 = set(data1)
        s2 = set(data2)
        actual_jaccard = float(len(s1.intersection(s2)))/float(len(s1.union(s2)))

        if(actual_jaccard > 0.3):
            print("Actual Jaccard for data1 and data2 is", actual_jaccard)
            print(sen)
            print(line)