def constructNNList(self, studentSets, sampledSets, studentData, sampledData):
		print('Creating min-hashes for student data')
		self.studentMinHashes = self.createMinHash(studentSets)
		print('Creating min-hashes for rubric data')
		self.sampledMinHashes = self.createMinHash(sampledSets)

		self.forest = MinHashLSHForest(num_perm = 128)
		i = 0
		for minHash in self.sampledMinHashes:
			self.forest.add(str(i), minHash)
			i += 1

		self.forest.index()

		print("calculating nearest neighbor")
		scores = []
		for i, query in enumerate(tqdm(self.studentMinHashes)):
			result = self.forest.query(query, 1)
			indexMatch = int(result[0])
			# Print an example of the student code and its nearest neighbor.
			print(result)
			print('Student Code: \n')
			print(studentData[i])
			print('\n')
			print('Closest Sampled Code: \n')
			print(sampledData[indexMatch])
			print('\n')
			score = self.sampledMinHashes[indexMatch].jaccard(query)
			print('Score: %f\n' % score)

			scores.append(score)

		return scores
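The snippet above assumes a createMinHash helper that turns each pre-tokenized program into a datasketch MinHash. A minimal sketch of what such a helper might look like (the method name, the 128-permutation default, and the shape of studentSets/sampledSets are assumptions, not part of the original):

    def createMinHash(self, sets, num_perm=128):
        # One MinHash per program; each program is an iterable of string tokens.
        minHashes = []
        for tokens in sets:
            m = MinHash(num_perm=num_perm)
            for token in tokens:
                m.update(token.encode('utf8'))  # MinHash.update expects bytes
            minHashes.append(m)
        return minHashes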
Example #2
    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf8'))
            self._index.add(str(i), m)
        self._index.index()
Example #3
    def __init__(self, num_permutation=60):
        self.__num_permutation = num_permutation
        self.__forest = MinHashLSHForest(self.__num_permutation)
        self.__lem = WordNetLemmatizer()
        stop_words = set(stopwords.words("english"))
        stop_words.add('—')
        stop_words.add('And')
        self.__stop_words = stop_words
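The constructor above only prepares the lemmatizer, stop-word set, and empty forest. A plausible companion method (the names add_document and __tokenize are illustrative assumptions, and datasketch's MinHash is assumed to be imported alongside MinHashLSHForest) would lemmatize the text, drop stop words, and hash the remaining tokens:

    def __tokenize(self, text):
        # Lowercase, lemmatize, and drop stop words before hashing.
        tokens = [self.__lem.lemmatize(w) for w in text.lower().split()]
        return [t for t in tokens if t not in self.__stop_words]

    def add_document(self, key, text):
        m = MinHash(num_perm=self.__num_permutation)
        for token in self.__tokenize(text):
            m.update(token.encode('utf8'))
        self.__forest.add(key, m)
        return m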
Example #4
    def clustering(self, data_tag):
        """
        Params:
          :data_tag: Whether it's source or target data
        """

        # create a min hash forest to quickly find nearest neighbours
        self.forest = MinHashLSHForest(num_perm=self.num_perm)

        # initialize clusters
        medoids = random.sample(range(len(self.data_points[data_tag])),
                                self.num_clusters[data_tag])

        for i in range(self.num_clusters[data_tag]):
            cl = self.ClusterClass(self.data_points[data_tag][medoids[i]])
            self.clusters[data_tag].append(cl)
            # put medoids into the forest
            self.forest.add(i, self.clusters[data_tag][-1].medoid.min_hash)
        self.forest.index()

        # for each data_point find a cluster
        self.cluster_points(data_tag)

        # these will be needed for the stopping criterion
        cluster_names = [
            self.clusters[data_tag][i].medoid.string
            for i in range(self.num_clusters[data_tag])
        ]
        cluster_names_old = list(cluster_names)
        count = 0
        counts = []
        exit = False

        # clustering loop
        while not exit:
            count += 1

            # find the point that minimizes the mean distance within a cluster
            self.find_medoid(data_tag)

            # create new forest
            self.forest = MinHashLSHForest(num_perm=self.num_perm)
            for i in range(self.num_clusters[data_tag]):
                self.forest.add(i, self.clusters[data_tag][i].medoid.min_hash)
            self.forest.index()

            # assign each point to the new medoids
            self.cluster_points(data_tag)

            # check stopping criterions
            exit, cluster_names, cluster_names_old, counts = \
              self.stop_clustering(data_tag,
                                   cluster_names,
                                   cluster_names_old,
                                   count,
                                   counts)
Example #5
    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                # encode to bytes: MinHash.update does not accept str in Python 3
                m.update(str(e).encode('utf8'))
            self._index.add(str(i), m)
        self._index.index()
Example #6
def search_lshforest_jaccard_topk(index_data, query_data, b, r, k):
    (index_sets, index_keys, index_minhashes) = index_data
    (query_sets, query_keys, query_minhashes) = query_data
    num_perm = b * r
    print("Building LSH Forest Index.")
    start = time.perf_counter()
    index = MinHashLSHForest(num_perm=num_perm, l=b)
    # Use the indices of the indexed sets as keys in LSH.
    for i in range(len(index_keys)):
        index.add(i, index_minhashes[num_perm][i])
    index.index()
    end = time.perf_counter()
    print("Indexing time: {:.3f}.".format(end - start))
    print("Querying.")
    times = []
    results = []
    for query_minhash, query_key, query_set in \
            zip(query_minhashes[num_perm], query_keys, query_sets):
        start = time.perf_counter()
        result = index.query(query_minhash, k * 2)
        # Recover the retrieved indexed sets and
        # compute the exact Jaccard similarities.
        result = [[index_keys[i],
                   compute_jaccard(query_set, index_sets[i])] for i in result]
        # Sort by similarity.
        result.sort(key=lambda x: x[1], reverse=True)
        # Take the top k.
        result = result[:k]
        duration = time.perf_counter() - start
        times.append(duration)
        results.append((query_key, result))
        sys.stdout.write(f"\rQueried {len(results)} sets")
    sys.stdout.write("\n")
    return (results, times)
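The function above relies on a compute_jaccard helper that is not shown. A minimal sketch consistent with how it is used here (exact Jaccard similarity, assuming both arguments are Python sets of tokens):

def compute_jaccard(set_a, set_b):
    # Exact Jaccard similarity between two token sets.
    if not set_a and not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)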
Example #7
def mylshforest(corpus):
    #print(len(corpus))
    forest = MinHashLSHForest(num_perm=32)
    score_res = [0]
    mh = []
    for i in range(len(corpus) - 1):
        doc = corpus[i]
        doc2 = corpus[i + 1]
        m = MinHash(num_perm=32)
        for d in doc:
            m.update(d.encode('utf8'))
        forest.add(str(i), m)
        forest.index()
        mh.append(m)

        m2 = MinHash(num_perm=32)
        for d in doc2:
            m2.update(d.encode('utf8'))
        result = forest.query(m2, 10)
        score = 0.0
        for j in range(len(result)):
            score = score + m2.jaccard(mh[int(result[j])])
        if (len(result) > 0):
            score = score / len(result)
        score_res.append(score)
    return score_res
Example #8
	def create_lsh_forest(self):
		cache_file = os.path.join(self.CACHE_DIR, 'lsh_forest.pkl')
		if not self.evict_cache and os.path.isfile(cache_file):
			# load precomputed
			print('Loading cached forest')
			self.forest = load_pickle(cache_file)
		else:
			sampledSets = self.processData(self.sampledData)
			self.sampledMinHashes = self.createMinHashSet(sampledSets)

			self.forest = MinHashLSHForest(num_perm=self.num_perm)
			for prog_idx, minHash in enumerate(self.sampledMinHashes):
				self.forest.add(prog_idx, minHash)

			self.forest.index()

			os.makedirs(self.CACHE_DIR, exist_ok=True)
			save_pickle(self.forest, cache_file)
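The caching logic above calls load_pickle and save_pickle helpers that are not shown. A minimal sketch consistent with how they are used:

import pickle

def load_pickle(path):
    # Load a previously cached object (e.g. an already-indexed MinHashLSHForest).
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_pickle(obj, path):
    # Cache an object to disk so the forest does not have to be rebuilt next run.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)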
Example #9
    def fit(self, X):
        self.index = numpy.empty([0, 32])
        self._index_minhash = []
        self._ball_index = []
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)

        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
            #self.index.append(m.digest())
            self.index = numpy.vstack((self.index, m.digest()))
            self._ball_index.append(m.digest())
            self._index_minhash.append(m)
        self._index.index()
        self._X = X

        self.tree = BallTree(self.index, leaf_size=self._n_leaves)
Example #10
def create_LSH_Forest():
    global forest
    if os.path.isfile(LSH_FOREST_FILE):
        load_forest()
    else:
        forest = MinHashLSHForest(num_perm=128)
    train_records = glob.glob("dataset/train*.tfrecord")
    validate_records = glob.glob("dataset/validate*.tfrecord")
    all_records = train_records + validate_records
    dataset = tf.data.TFRecordDataset(all_records)
    iterator = dataset.make_one_shot_iterator()
    count = 0
    next_element = iterator.get_next()
    updated = False
    with tf.Session() as sess:
        try:
            while True:
                if count % 10000 == 0:
                    print "[SimpleVideoSearch][{}] Processed {} records from the dataset so far".format(
                        datetime.now(), count)
                if updated and count % 100000 == 0:
                    with open(LSH_FOREST_FILE, 'wb') as forest_file:
                        forest.index()
                        pickle.dump(forest, forest_file,
                                    pickle.HIGHEST_PROTOCOL)
                    print "[SimpleVideoSearch][{}] Updated LSH Forest file".format(
                        datetime.now(), count)
                exampleBinaryString = sess.run(next_element)
                example = tf.train.Example.FromString(exampleBinaryString)
                count += 1
                example_id = example.features.feature["id"].bytes_list.value[0]
                if example_id not in forest:
                    if not updated:
                        updated = True
                        print '[SimpleVideoSearch][{}] First update at record {}'.format(
                            datetime.now(), count)
                    dataset_labels_full = convert_dataset_labels_to_list(
                        example.features.feature["labels"].int64_list.value)
                    minhash = MinHash(num_perm=128)
                    for label in dataset_labels_full:
                        minhash.update(label)
                    forest.add(example_id, minhash)
        except tf.errors.OutOfRangeError:
            print "[SimpleVideoSearch][{}] Done iterating through dataset".format(
                datetime.now())
        finally:
            print "[SimpleVideoSearch][{}] Processed {} records from the dataset".format(
                datetime.now(), count)
            forest.index()
            with open(LSH_FOREST_FILE, 'wb') as forest_file:
                pickle.dump(forest, forest_file, pickle.HIGHEST_PROTOCOL)
            print "[SimpleVideoSearch][{}] Finished creating LSH Forest file".format(
                datetime.now(), count)
Example #11
    def lsh_forest(self,
                   algoritm_type=None,
                   use_components=None,
                   type_option=None,
                   n_char=None,
                   n_word=None):
        """
        LSH Function.
        
        Parameters
        --------
        use_components: list, optional: ['name', 'addr'] or ['name'] or ['addr'].
            Components to use.
        type_option: list, optional: ['char', 'word'] or ['char'] or ['word'].
            Components to use.
        n_char: list of int
            sizes of char grams. 
        n_word: list of int
            sizes of word grams.
        algoritm_type: list, optional: [weighed] or [not_weighed]
            Type of algorithm
        """

        algoritm_type = algoritm_type or 'not_weighed'
        use_components = use_components or ['name']
        type_option = type_option or ['char']
        n_char = n_char or [3]
        n_word = n_word or [1]

        if 'char' not in type_option and 'word' not in type_option:
            assert False, "Check the value of the type_option parameter."

        if 'name' not in use_components and 'addr' not in use_components:
            assert False, "Check the value of the use_components parameter."

        for i in use_components:
            for j in type_option:
                n_list = n_char if j == 'char' else n_word
                for n in n_list:
                    LpuList.lsh['{}_{}_{}_{}lsh'.format(
                        algoritm_type, i, j,
                        n)] = MinHashLSHForest(self.num_perm)

                    for idx, minhash in enumerate(
                            self.features['{}_{}_{}_{}minhash'.format(
                                algoritm_type, i, j, n)]):
                        LpuList.lsh['{}_{}_{}_{}lsh'.format(
                            algoritm_type, i, j,
                            n)].add(self.indices[idx], minhash)

                    LpuList.lsh['{}_{}_{}_{}lsh'.format(
                        algoritm_type, i, j, n)].index()

        return self
Example #12
class MinHas(BaseANN):
    def __init__(self, metric, n_perm, n_rep):
        if metric not in ('jaccard',):
            raise NotImplementedError(
                "Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
        self._index.index()

    def query(self, v, n):
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))
        result = self._index.query(m, n)
        print(result)
        return map(int, result)
Example #13
    def _get_forest(self, data, perms):

        # START Time
        self.START_TIME = time.time()

        minhash_list = []

        for text in data['text']:
            min_hashtext = _create_hashtex(text=text,
                                           perms=perms,
                                           language=self.LANGUAGE)
            minhash_list.append(min_hashtext)

        forest = MinHashLSHForest(num_perm=perms)

        for item_index, list_item in enumerate(minhash_list):
            forest.add(item_index, list_item)

        forest.index()

        # END Time
        self.END_TIME = time.time()

        # TIMING LIST
        self.TIMING = [self.END_TIME, self.START_TIME]

        print('It took %s seconds to build forest.' %
              (calculate_duration(self.TIMING)))
        return forest
Example #14
class DataSketch(BaseANN):
    def __init__(self, metric, n_perm, n_rep):
        if metric not in ('jaccard',):
            raise NotImplementedError("Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                # encode to bytes: MinHash.update does not accept str in Python 3
                m.update(str(e).encode('utf8'))
            self._index.add(str(i), m)
        self._index.index()

    def query(self, v, n):
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf8'))
        return map(int, self._index.query(m, n))
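A short usage sketch for the wrapper above (the toy data is made up for illustration; n_rep becomes the forest's l parameter):

ann = DataSketch(metric='jaccard', n_perm=128, n_rep=8)
ann.fit([[1, 2, 3], [2, 3, 4], [7, 8, 9]])
# Indices of the two indexed sets most similar to the query set.
print(list(ann.query([2, 3, 5], n=2)))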
Example #15
def getMinhashforest2(minhashs):
    # Create a MinHash LSH Forest with the same num_perm parameter
    forest = MinHashLSHForest(num_perm=128)
    for i in range(len(minhashs)):
        # Add m2 and m3 into the index
        forest.add(i, minhashs[i])
    # IMPORTANT: must call index() otherwise the keys won't be searchable
    forest.index()
    return forest
Example #16
def build_lsh_forest_hash(game_data):
    forest = MinHashLSHForest(num_perm=_utils.HASH_REZ)
    for ind, row in game_data.iterrows():
        try:
            forest.add(f"{row['title']} (id:{row['id']})", row['_sim_hash'])
        except ValueError:
            print(f"{row['title']} already added")
        except:
            raise
    forest.index()
    return forest
Example #17
def construct_lsh(obj_dict):
    forest = MinHashLSHForest(num_perm=128)
    # materialize keys/values so they can be indexed by position (dict views are not subscriptable)
    keys = list(obj_dict.keys())
    values = list(obj_dict.values())
    ms = []
    for i in range(len(keys)):
        temp = MinHash(num_perm=128)
        for d in values[i]:
            temp.update(d.encode('utf8'))
        ms.append(temp)
        forest.add(keys[i], temp)
    forest.index()
    return forest, keys, ms
Example #18
def store_lsh():
    forest = MinHashLSHForest(num_perm=128)
    documents_en = docs_col.find({"lang": 'english'})
    for item in documents_en:
        minhash = MinHash(num_perm=128)
        ngrams = ngrams_token(remove_punctuation(item['content']), 3)
        for ngram in ngrams:
            minhash.update(ngram.encode("utf-8"))
        forest.add(str(item["_id"]), minhash)
    forest.index()
    ouf = open('pickle_ngram.txt', 'wb')
    cPickle.dump(forest, ouf)
    ouf.close()
    return forest
Example #19
def target_lsh(grams):
    lsh_forest = MinHashLSHForest(num_perm=4000, l=200)
    lsh = MinHashLSH(threshold=0.5, num_perm=4000)
    # minhashes = {}
    for c, i in enumerate(grams):
        minhash = MinHash(num_perm=4000)
        i = i.replace(' ', '')
        for d in ngrams(i, 3):
            # join the character n-gram and encode it, since MinHash.update expects bytes
            minhash.update(''.join(d).encode('utf8'))

        lsh_forest.add(c, minhash)
        lsh.insert(c, minhash)
    # index once, after all keys are added, so the forest is searchable
    lsh_forest.index()
    return lsh_forest, lsh
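A brief usage sketch (the sample strings are made up, and ngrams is assumed to be the same n-gram helper the function above uses): the forest answers top-k queries, while the plain MinHashLSH answers threshold queries using the 0.5 cutoff set above.

forest, lsh = target_lsh(['hello world', 'hello there', 'goodbye world'])

probe = MinHash(num_perm=4000)
for d in ngrams('hello world'.replace(' ', ''), 3):
    probe.update(''.join(d).encode('utf8'))

print(forest.query(probe, 2))  # keys of up to 2 approximate nearest neighbours
print(lsh.query(probe))        # all keys whose estimated similarity exceeds 0.5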
Example #20
    def build_lsh_forest(self, company_name_column_name):
        """
        Build the LSH forest data structure from the sets of parsed description words for each company

        Parameters:

            company_name_column_name - string; name of the company name column in the company corpus dataframe
        """
        # Note: num_perm is a tuning parameter, but has been abstracted away for simplicity
        #       256 has been found to be a good amount. Increasing it may increase accuracy,
        #       but will decrease speed and increase memory usage. Decreasing will decrease accuracy

        lsh_forest = MinHashLSHForest(num_perm=256)

        iteration = 1

        self.company_name_column_name = company_name_column_name
        self.name_to_index_map = dict(
            zip(self.company_corpus.corpus.loc[:, company_name_column_name],
                self.company_corpus.corpus.index))
        self.index_to_name_map = dict(
            zip(self.company_corpus.corpus.index,
                self.company_corpus.corpus.loc[:, company_name_column_name]))

        sys.stdout.write("Performing LSH...")
        for company in self.company_corpus.corpus.iterrows():

            # Utilize the 'datasketch' library to minhash the company descriptions and hash into the LSH forest
            company_name = company[1][company_name_column_name]
            if company_name in self.dict_of_minhash_keys:
                continue
            mh = MinHash(num_perm=256)
            if type(company[1]['rare_words']) is float:
                mh.update(str(company[1]['rare_words']).encode('utf8'))
            else:
                for word in company[1]['rare_words']:
                    mh.update(str(word).encode('utf8'))
            self.dict_of_minhash_keys[company_name] = mh
            lsh_forest.add(company_name, mh)

            iteration += 1
        sys.stdout.write('\n')
        sys.stdout.write("Done performing LSH!\n")

        # Need this line below to be able to query LSH forest! (See datasketch docs on LSH forest for reasoning)
        lsh_forest.index()
        self.lsh_forest = lsh_forest
Example #21
def benchmark_lshforest(num_perm, l, k, index_data, query_data):
    print("Building LSH Forest index")
    forest = MinHashLSHForest(num_perm=num_perm, l=l)
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        forest.add(key, minhash)
    forest.index()
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        start = time.perf_counter()
        result = forest.query(minhash, k)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(
            sorted([[key, _compute_jaccard(qs, index_data.sets[key])]
                    for key in result],
                   key=lambda x: x[1],
                   reverse=True))
    return times, results
Example #22
    def __train_LSH(self, data):
        start_time = time.time()
        forest = MinHashLSHForest(num_perm=config.permutations)
        for item in tqdm(data, desc="MinHash Docs.."):
            tag = item['tag']
            tokens = item['data']

            if self.type == 'trigram':
                tokens = self.normalizer.generate_ngrams_char(tokens[0])
            m = MinHash(num_perm=config.permutations)
            for s in tokens:
                m.update(s.encode('utf8'))
            forest.add(tag,m)

        forest.index()
        print('It took %.2f seconds to build forest.' % (time.time() - start_time))
        return forest
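A matching query step for the forest trained above might min-hash the input the same way and return the closest tags. This is only a sketch (the method name __query_LSH and the top_k parameter are assumptions; it reuses the original's config.permutations and normalizer):

    def __query_LSH(self, forest, tokens, top_k=5):
        # Hash the query tokens exactly as the index was built.
        if self.type == 'trigram':
            tokens = self.normalizer.generate_ngrams_char(tokens[0])
        m = MinHash(num_perm=config.permutations)
        for s in tokens:
            m.update(s.encode('utf8'))
        return forest.query(m, top_k)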
Example #23
    def build_lsh_forest(self, company_name_column_name):

        # Note: num_perm is a tuning parameter, but has been abstracted away for simplicity
        #       256 has been found to be a good amount. Increasing it may increase accuracy,
        #       but will decrease speed and increase memory usage. Decreasing will decrease accuracy

        lsh_forest = MinHashLSHForest(num_perm=256)
        iteration = 0

        self.company_name_column_name = company_name_column_name
        self.name_to_index_map = dict(
            zip(self.company_corpus.corpus.loc[:, company_name_column_name],
                self.company_corpus.corpus.index))
        self.index_to_name_map = dict(
            zip(self.company_corpus.corpus.index,
                self.company_corpus.corpus.loc[:, company_name_column_name]))

        graph_size = self.company_corpus.corpus.shape[0]

        for company in self.company_corpus.corpus.iterrows():
            company_name = company[1][company_name_column_name]
            if company_name in self.dict_of_minhash_keys:
                continue
            mh = MinHash(num_perm=256)
            if type(company[1]['rare_words']) is float:
                mh.update(str(company[1]['rare_words']).encode('utf8'))
            else:
                for word in company[1]['rare_words']:
                    mh.update(str(word).encode('utf8'))
            self.dict_of_minhash_keys[company_name] = mh
            lsh_forest.add(company_name, mh)
            if iteration % 10000 == 0 or (iteration + 1) == graph_size:
                if (iteration + 1) == graph_size:
                    iteration += 1
                sys.stdout.write('\r')
                sys.stdout.write(
                    "LSH Forest Build Percent Complete: {0:0.2f}%".format(
                        round((iteration / graph_size) * 100)))
                sys.stdout.flush()
            iteration += 1
        sys.stdout.write('\n')

        # Need this line below to make the forest searchable!
        lsh_forest.index()
        self.lsh_forest = lsh_forest
Example #24
def toBuildLSH(cleanSongs):
    '''
    :param cleanSongs
    :return: forest, min_hash_list
    '''
    forest = MinHashLSHForest(num_perm=128)
    min_hash_list = []
    for songIndex, song in enumerate(cleanSongs):
        minhash = MinHash(num_perm=128)
        for word in song:
            ### encoding each word
            minhash.update(word.encode('utf8'))
        ### add each song's minhash to the forest as well as min_hash_list
        forest.add(str(songIndex), minhash)
        min_hash_list.append(minhash)

    forest.index()
    return forest, min_hash_list
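One possible way to use the returned forest and min-hash list, e.g. to find songs similar to the first one (the variable names below are illustrative; cleanSongs is the tokenized song list the function above expects):

forest, min_hash_list = toBuildLSH(cleanSongs)

# Retrieve up to 5 candidate songs for song 0, then rank them by estimated Jaccard similarity.
candidates = forest.query(min_hash_list[0], 5)
ranked = sorted(candidates,
                key=lambda idx: min_hash_list[int(idx)].jaccard(min_hash_list[0]),
                reverse=True)
print(ranked)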
Example #25
    def get_forest(self, data, perms):

        minhash = []

        for text in data['err']:
            tokens = self.preprocess(text)
            m = MinHash(num_perm=perms)
            for s in tokens:
                m.update(s.encode('utf8'))
            minhash.append(m)

        forest = MinHashLSHForest(num_perm=perms)

        for i, m in enumerate(minhash):
            forest.add(i, m)

        forest.index()

        return forest
Example #26
    def form_lsh(self):
        minhash = []

        for s in self.__items:
            m = MinHash(num_perm=256)
            for q in s:
                m.update(q.encode('utf8'))
            minhash.append(m)

        forest = MinHashLSHForest(num_perm=256)

        for i, m in enumerate(minhash):
            forest.add(i, m)

        forest.index()
        self.__forest = forest
        self.__hashlist = minhash

        return forest
Example #27
def build_lsh_forest(columns, override=False):
    """
    Builds a minHash LSH forest which can be used to query top-k columns with maximum Jaccard similarity
    @param override:
    @param columns:
    @return:
    """
    file_path = f'{os.environ["WORKING_DIRECTORY"]}/results/forest.obj'
    if override or not os.path.isfile(file_path):
        forest = MinHashLSHForest(num_perm=NUM_PERM)
        for column in columns:
            forest.add(f'{column["table"]}.{column["column"]}', deserialize_minhash(column))
        forest.index()
        with open(file_path, 'wb') as file:
            pickle.dump(forest, file)
        return forest

    with open(file_path, 'rb') as file:
        forest = pickle.load(file)

    return forest
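Querying the forest returned above for the most similar columns might look like this (reusing the original's columns list and deserialize_minhash helper; the choice of the first column as the query is illustrative):

forest = build_lsh_forest(columns)

# Top-5 'table.column' keys most similar to the first column's MinHash.
query_minhash = deserialize_minhash(columns[0])
print(forest.query(query_minhash, 5))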
Example #28
def get_forest(records, perms):
    start_time = time.time()

    minhash = []
    for record in records:
        for text in record:
            tokens = preprocess(text)
            m = MinHash(num_perm=perms)
            for s in tokens:
                m.update(s.encode('utf8'))
            minhash.append(m)

    # build the forest once, after all records have been min-hashed
    forest = MinHashLSHForest(num_perm=perms)
    for i, m in enumerate(minhash):
        forest.add(i, m)

    forest.index()

    print('It took %s seconds to build forest.' % (time.time() - start_time))

    return forest
Example #29
    def __datasketch_fit(self):
        if self.kwargs['create']:
            # Create a list of MinHash objects
            min_hash_obj_list = []
            forest = MinHashLSHForest(num_perm=self.kwargs['num_perm'])
            for i in range(len(self.features)):
                min_hash_obj_list.append(
                    MinHash(num_perm=self.kwargs['num_perm']))
                for d in self.features[i]:
                    min_hash_obj_list[i].update(d)
                forest.add(i, min_hash_obj_list[i])
            # IMPORTANT: must call index() otherwise the keys won't be searchable
            forest.index()
            with open(self.kwargs['file_path'], "wb") as f:
                pickle.dump(forest, f)
                pickle.dump(min_hash_obj_list, f)
            self.predictor = [forest, min_hash_obj_list]
        else:
            with open(self.kwargs['file_path'], "rb") as f:
                forest = pickle.load(f)
                min_hash_obj_list = pickle.load(f)
                self.predictor = [forest, min_hash_obj_list]
Example #30
def get_forest(data, perms):
    start_time = time.time()

    minhash = []

    for text in data:
        tokens = p.preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf-8'))
        minhash.append(m)

    forest = MinHashLSHForest(num_perm=perms)

    for i, m in enumerate(minhash):
        forest.add(i, m)

    forest.index()

    print('time to build forest: ', (time.time() - start_time))

    return forest
Example #31
data1 = [
    'minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
    'estimating', 'the', 'similarity', 'between', 'datasets'
]
data2 = [
    'minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
    'estimating', 'the', 'similarity', 'between', 'documents'
]
data3 = [
    'minhash', 'is', 'probability', 'data', 'structure', 'for', 'estimating',
    'the', 'similarity', 'between', 'documents'
]

dataset = [[0., 0., 0.], [0., 0., 1.], [0., 1., 0.], [0., 1., 1.]]
# Create a MinHash LSH Forest with the same num_perm parameter
forest = MinHashLSHForest(num_perm=128)

for i, data in enumerate(dataset):
    m = MinHash(num_perm=128)
    for d in data:
        m.update(str(d).encode('utf8'))
    forest.add(str(i), m)

# IMPORTANT: must call index() otherwise the keys won't be searchable

pickle.dump(forest, open('forest.lsh', 'wb'))
del forest
forest = pickle.load(open('forest.lsh', 'rb'))

forest.index()
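Once the reloaded forest has been indexed, it can be queried with a MinHash built the same way. A brief illustrative continuation (the choice of dataset[0] as the query is arbitrary):

# Query the reloaded, indexed forest with the MinHash of the first data point.
query = MinHash(num_perm=128)
for d in dataset[0]:
    query.update(str(d).encode('utf8'))
print(forest.query(query, 2))  # keys of up to 2 approximate nearest neighbours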
Example #32
    return (hotelName, newMinHash)


def growForest(forest, minHahs):
    for hotel, hash in minHahs:
        forest.add(hotel, hash)


allHotels = getHotelsDict()

file_path = join("data", "lshforrest.p")

# Create min-hashes for all the words
if not isfile(file_path):
    # Create a MinHash LSH Forest with the same num_perm parameter
    forest = MinHashLSHForest(num_perm=NN_PERM)
    allKeys = list(allHotels.keys())
    print(len(allKeys))
    with Pool(4) as pool:
        minHahs = pool.map(makeMinHash, allKeys[0:100000])
        print("Done 1!")
        growForest(forest, minHahs)
        minHahs = pool.map(makeMinHash, allKeys[100000:200000])
        print("Done 2!")
        growForest(forest, minHahs)
        minHahs = pool.map(makeMinHash, allKeys[200000:300000])
        print("Done 3!")
        growForest(forest, minHahs)
        minHahs = pool.map(makeMinHash, allKeys[300000:400000])
        print("Done 4!")
        growForest(forest, minHahs)