Example #1
class MinHas(BaseANN):
    def __init__(self, metric, n_perm, n_rep):
        if metric not in ('jaccard',):
            raise NotImplementedError(
                "Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
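        # Keys added to the forest are not searchable until index() is called.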
        self._index.index()

    def query(self, v, n):
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))
        result = self._index.query(m, n)
        print(result)
        return map(int, result)
Example #2
def mylshforest(corpus):
    #print(len(corpus))
    forest = MinHashLSHForest(num_perm=32)
    score_res = [0]
    mh = []
    for i in range(len(corpus) - 1):
        doc = corpus[i]
        doc2 = corpus[i + 1]
        m = MinHash(num_perm=32)
        for d in doc:
            m.update(d.encode('utf8'))
        forest.add(str(i), m)
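        # Re-index after each add; newly added keys only become searchable after index().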
        forest.index()
        mh.append(m)

        m2 = MinHash(num_perm=32)
        for d in doc2:
            m2.update(d.encode('utf8'))
        result = forest.query(m2, 10)
        score = 0.0
        for j in range(len(result)):
            score = score + m2.jaccard(mh[int(result[j])])
        if (len(result) > 0):
            score = score / len(result)
        score_res.append(score)
    return score_res
Example #3
def search_lshforest_jaccard_topk(index_data, query_data, b, r, k):
    (index_sets, index_keys, index_minhashes) = index_data
    (query_sets, query_keys, query_minhashes) = query_data
    num_perm = b * r
    print("Building LSH Forest Index.")
    start = time.perf_counter()
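    # l = b prefix trees; each tree indexes num_perm // l = r of the MinHash values per key.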
    index = MinHashLSHForest(num_perm=num_perm, l=b)
    # Use the indices of the indexed sets as keys in LSH.
    for i in range(len(index_keys)):
        index.add(i, index_minhashes[num_perm][i])
    index.index()
    end = time.perf_counter()
    print("Indexing time: {:.3f}.".format(end - start))
    print("Querying.")
    times = []
    results = []
    for query_minhash, query_key, query_set in \
            zip(query_minhashes[num_perm], query_keys, query_sets):
        start = time.perf_counter()
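        # The LSH Forest result is approximate, so over-fetch 2*k candidates
        # and re-rank them by exact Jaccard below.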
        result = index.query(query_minhash, k * 2)
        # Recover the retrieved indexed sets and
        # compute the exact Jaccard similarities.
        result = [[index_keys[i],
                   compute_jaccard(query_set, index_sets[i])] for i in result]
        # Sort by similarity.
        result.sort(key=lambda x: x[1], reverse=True)
        # Take the top k.
        result = result[:k]
        duration = time.perf_counter() - start
        times.append(duration)
        results.append((query_key, result))
        sys.stdout.write(f"\rQueried {len(results)} sets")
    sys.stdout.write("\n")
    return (results, times)
Example #4
def benchmark_lshforest(num_perm, l, k, index_data, query_data):
    print("Building LSH Forest index")
    forest = MinHashLSHForest(num_perm=num_perm, l=l)
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        forest.add(key, minhash)
    forest.index()
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        result = forest.query(minhash, k)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(
            sorted([[key, _compute_jaccard(qs, index_data.sets[key])]
                    for key in result],
                   key=lambda x: x[1],
                   reverse=True))
    return times, results
Example #5
class DataSketch(BaseANN):
    def __init__(self, metric, n_perm, n_rep):
        if metric not in ('jaccard',):
            raise NotImplementedError("Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf-8'))  # MinHash.update() expects bytes
            self._index.add(str(i), m)
        self._index.index()

    def query(self, v, n):
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))
        return map(int, self._index.query(m, n))
Example #6
# Get the tokenized documents
documents = []
for item_text in sentences:
    # Tokenize item_text
    item_str = get_item_str(item_text)
    documents.append(item_str)

# Create the LSH Forest and MinHash objects
minhash_list = []
forest = MinHashLSHForest()
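# The forest uses the default num_perm (128); get_minhash() must build its
# MinHash objects with the same num_perm.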
for i in range(len(documents)):
    # Compute the MinHash of documents[i]
    temp = get_minhash(documents[i])
    minhash_list.append(temp)
    forest.add(i, temp)
# Index all keys so they can be searched
forest.index()

query = '00:01:36,2019天猫双11总成交额超100亿元'
# Tokenize the query text
item_str = get_item_str(query)
# Compute the MinHash of item_str
minhash_query = get_minhash(item_str)

# Query the forest for the Top-K neighbors most similar to minhash_query
result = forest.query(minhash_query, 3)
for i in range(len(result)):
    print(result[i], minhash_query.jaccard(minhash_list[result[i]]),
          documents[result[i]].replace(' ', ''))
print("Top 3 邻居", result)
Example #7
    def saver(self, i, q, retq, matq, l):
        print_start = t.time()
        save_start = t.time()
        global_time = t.time()
        chunk_size = 100
        count = 0
        forest = MinHashLSHForest(num_perm=self.numperm)

        taxstr = ''
        if self.tax_filter is None:
            taxstr = 'NoFilter'
        if self.tax_mask is None:
            taxstr += 'NoMask'
        else:
            taxstr = str(self.tax_filter)
        dataset_name = self.saving_name + '_' + taxstr
        self.errorfile = self.saving_path + 'errors.txt'
        with open(self.errorfile, 'w') as hashes_error_files:
            with h5py.File(self.hashes_path, 'w', libver='latest') as h5hashes:
                datasets = {}
                if dataset_name not in h5hashes.keys():
                    if self.verbose == True:
                        print('creating dataset')
                        print(dataset_name)
                        print('filtered at taxonomic level: ' + taxstr)
                    h5hashes.create_dataset(dataset_name + '_' + taxstr,
                                            (chunk_size, 0),
                                            maxshape=(None, None),
                                            dtype='int32')
                    datasets[dataset_name] = h5hashes[dataset_name + '_' +
                                                      taxstr]
                    if self.verbose == True:
                        print(datasets)
                    h5flush = h5hashes.flush
                print('saver init ' + str(i))
                while True:
                    this_dataframe = retq.get()
                    if this_dataframe is not None:
                        if not this_dataframe.empty:
                            hashes = this_dataframe['hash'].to_dict()
                            print(str(this_dataframe.Fam.max()) + 'fam num')
                            print(str(count) + ' done')
                            hashes = {
                                fam: hashes[fam]
                                for fam in hashes if hashes[fam]
                            }
                            for fam in hashes:
                                forest.add(str(fam), hashes[fam])
                            for fam in hashes:
                                if len(datasets[dataset_name]) < fam + 10:
                                    datasets[dataset_name].resize(
                                        (fam + chunk_size,
                                         len(hashes[fam].hashvalues.ravel())))
                                datasets[dataset_name][
                                    fam, :] = hashes[fam].hashvalues.ravel()
                                count += 1
                            if t.time() - save_start > 200:
                                print(t.time() - global_time)
                                forest.index()
                                print(forest.query(hashes[fam], k=10))
                                h5flush()
                                save_start = t.time()
                                with open(self.lshforestpath,
                                          'wb') as forestout:
                                    forestout.write(pickle.dumps(forest, -1))
                                if self.verbose == True:
                                    print('save done at ' +
                                          str(t.time() - global_time))
                        else:
                            print(this_dataframe)
                    else:
                        if self.verbose == True:
                            print('wrap it up')
                        with open(self.lshforestpath, 'wb') as forestout:
                            forestout.write(pickle.dumps(forest, -1))
                        h5flush()
                        if self.verbose == True:
                            print('DONE SAVER' + str(i))
                        break
Example #8
class DataSketch(BaseANN):
    def __init__(self, metric, n_perm, n_rep, n_leaves):
        if metric not in ('jaccard',):
            raise NotImplementedError("Datasketch doesn't support metric %s" %
                                      metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._n_leaves = n_leaves
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d, n_leaves=%d)' % (
            n_perm, n_rep, n_leaves)

    def fit(self, X):
        self.index = numpy.empty([0, self._n_perm])  # one row per MinHash digest (length num_perm)
        self._index_minhash = []
        self._ball_index = []
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)

        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
            #self.index.append(m.digest())
            self.index = numpy.vstack((self.index, m.digest()))
            self._ball_index.append(m.digest())
            self._index_minhash.append(m)
        self._index.index()
        self._X = X

        self.tree = BallTree(self.index, leaf_size=self._n_leaves)
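        # Note: BallTree defaults to Euclidean distance over the raw MinHash digests,
        # which is not equivalent to Jaccard distance on the underlying sets.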

        # self._annoy = annoy.AnnoyIndex(X.shape[1], metric='euclidean')
        # for i, x in enumerate(X):
        #     self._annoy.add_item(i, x.tolist())
        # self._annoy.build(100)

    def query(self, v, n):
        print("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))

        # for i in self._annoy.get_nns_by_vector(v.tolist(), n, 100):
        #     print(self._index_minhash[int(i)].jaccard(m))

        dist, ind = self.tree.query([m.digest()], k=n)
        for i in ind[0]:
            # print(i)
            print(self._index_minhash[int(i)].jaccard(m))
        print("=======================")
        brute_indices = self.query_with_distances(m.digest(), n)
        for i in brute_indices:
            print(self._index_minhash[int(i)].jaccard(m))
        print("-----------------------")
        ind2 = self._index.query(m, n)
        for i in ind2:
            print(self._index_minhash[int(i)].jaccard(m))

        # return map(int, ind[0])
        return self.query_with_distances(m.digest(), n)

    popcount = []
    for i in range(256):
        popcount.append(bin(i).count("1"))

    def query_with_distances(self, v, n):
        """Find indices of `n` most similar vectors from the index to query vector `v`."""
        if self._metric == 'jaccard':
            dists = numpy.array(
                [pd[self._metric]['distance'](v, e) for e in self.index])
        else:
            assert False, "invalid metric"  # shouldn't get past the constructor!
        # partition-sort by distance, get `n` closest
        nearest_indices = dists.argsort()[-n:][::-1]

        return nearest_indices
Example #9
class LshNN(ProgramNN):
	CACHE_DIR = 'cache/'

	def __init__(self, sampledDataPath, num_perm=128, top_k=1, evict_cache=False):
		"""
		An agent class to find rubric sampled nearest neighbour of a given
		program by using a MinHash LSH forest.

		"""
		self.sampledDataPath = sampledDataPath
		self.num_perm = num_perm
		self.top_k = top_k
		self.evict_cache = evict_cache
		self.rawProgramData, self.sampledData = self.loadSyntheticData()
		self.create_lsh_forest()


	def create_lsh_forest(self):
		cache_file = os.path.join(self.CACHE_DIR, 'lsh_forest.pkl')
		if not self.evict_cache and os.path.isfile(cache_file):
			# load precomputed
			print('Loading cached forest')
			self.forest = load_pickle(cache_file)
		else:
			sampledSets = self.processData(self.sampledData)
			self.sampledMinHashes = self.createMinHashSet(sampledSets)

			self.forest = MinHashLSHForest(num_perm=self.num_perm)
			for prog_idx, minHash in enumerate(self.sampledMinHashes):
				self.forest.add(prog_idx, minHash)

			self.forest.index()

			os.makedirs(self.CACHE_DIR, exist_ok=True)
			save_pickle(self.forest, cache_file)

	def minHash(self, code_tokens):
		minHash = MinHash(num_perm=self.num_perm)
		for d in code_tokens: # TODO modify this for n-grams
			minHash.update("".join(d).encode('utf-8'))

		return minHash

	# create minHash objects for every dataset
	def createMinHashSet(self, dataset):
		minHashes = []
		for code in tqdm(dataset):
			minHashes.append(self.minHash(code))
		return minHashes

	def multi_dict_get(self, key, all_dicts):
		for dic in all_dicts:
			if key in dic:
				return dic[key]
		raise ValueError('Key not in any of the dictionaries')

	def loadSyntheticData(self):
		cache_file = os.path.join(self.CACHE_DIR, 'lsh_programs.pkl')
		if not self.evict_cache and os.path.isfile(cache_file):
			data = load_json(cache_file)
			prog_items = data['raw_programs']
			anon_progs = data['anon_programs']
		else:
			standard_path = self.sampledDataPath + '/standard/train' + SYNTH_NAME
			uniform_path = self.sampledDataPath + '/uniform/train' + SYNTH_NAME
			tempered_path = self.sampledDataPath + '/tempered/train' + SYNTH_NAME
			standardDict = pickle.load(open(standard_path, "rb" ))
			uniformDict = pickle.load(open(uniform_path, "rb" ))
			temperedDict =  pickle.load(open(tempered_path, "rb" ))

			all_dicts = [standardDict, uniformDict, temperedDict]

			# this step is not stable across different runs if caching forest
			# so this needs to be cached too
			prog_items = list(standardDict.keys() | uniformDict.keys() | temperedDict.keys())
			anon_progs = [self.multi_dict_get(prog, all_dicts) for prog in prog_items]
			data = dict(raw_programs=prog_items, anon_programs=anon_progs)

			os.makedirs(self.CACHE_DIR, exist_ok=True)
			save_json(data, cache_file)

			# if we don't load the cache here, we should regenerate the forest too
			self.evict_cache = True

		return prog_items, anon_progs



	def transformCode(self, program):
		splitCode = program.split()
		return splitCode
		#return ngrams(splitCode, 3)

	# tokenize every sentence and return a list of sentences
	def processData(self, dataset):
		processed = []
		for datum in dataset:
			transformedCode = self.transformCode(datum)
			processed.append(transformedCode)
		return processed

	def findNearestNeighbours(self, studentProgram, **kwargs):
		minHash = self.minHash(self.transformCode(studentProgram))
		result = self.forest.query(minHash, self.top_k)
		top_k_programs_anon = [self.sampledData[idx] for idx in result]
		top_k_programs = [self.rawProgramData[idx] for idx in result]
		#return top_k_programs, top_k_programs_anon
		return top_k_programs
Example #10
class AutoTag():

    def __init__(self, num_permutation=60):
        self.__num_permutation = num_permutation
        self.__forest = MinHashLSHForest(self.__num_permutation)
        self.__lem = WordNetLemmatizer()
        stop_words = set(stopwords.words("english"))
        stop_words.add('—')
        stop_words.add('And')
        self.__stop_words = stop_words

    def fit(self, csv):
        df = pd.read_csv(csv)
        df.drop_duplicates(subset='webURL', keep=False, inplace=True)
        df.dropna(inplace=True)
        for index, row in df.iterrows():
            min_hash = self.make_min_hash(self.make_clean_words_list(row['Text']))
            self.__forest.add(row['webURL'], min_hash)
            if index % 100 == 0:
                print(index, end='\r', flush=True)
        self.__forest.index()


    def make_clean_words_list(self, text):
        text = re.sub('[^a-zA-Z]', ' ', text)

        #Convert to lowercase
        text = text.lower()

        # remove tags
        text = re.sub("</?.*?>", " <> ", text)

        # remove special characters and digits
        text = re.sub("(\\d|\\W)+", " ", text)

        #Lemmatisation
        text = text.split()
        lem = WordNetLemmatizer()
        text = [lem.lemmatize(word) for word in text if not word in self.__stop_words]

        return text


    def predict(self, text, num_of_neighbors):
        # TODO: change results into tags
        query = self.make_min_hash(self.make_clean_words_list(text))
        return self.__forest.query(query, num_of_neighbors)




    def make_min_hash(self,words):
        min_hash = MinHash(self.__num_permutation)
        for word in words:
            min_hash.update(word.encode('utf8'))
        return min_hash


    def load_trained_model(self, trained_model_file_name, num_of_permutations):
        self.__forest = pickle.load(open(trained_model_file_name, 'rb'))
        self.__num_permutation = num_of_permutations

    def save_model(self, file_name):
        pickle.dump(self.__forest, open(file_name, 'wb'))
Example #11
class duplicate_docs:
    def __init__(self):
        self.lsh = MinHashLSHForest(
            num_perm=config.LSH_CONFIG['num_permutation'])

    def load(self, model):
        print('loading %s ...' % (model))
        if os.path.isfile(model):
            return joblib.load(model)
        else:
            return None

    def save(self, model, path):
        print('saving %s ...' % (path))
        joblib.dump(model, path)
        return

    # load data from list documents
    def run(self, docs):
        count = 1
        for itemid, content in docs.items():
            try:
                doc = document(content)
                self.insert(doc, key=itemid)
                print('\rpushed %d items' % count, end='')
                sys.stdout.flush()
                count += 1
            except:
                pass
        self.lsh.index()
        print('')

    def run_ex(self, itemid, content, call_index=True):
        try:
            doc = document(content)
            self.insert(doc, key=itemid)
            if call_index:
                self.lsh.index()
        except:
            pass

    def query(self, doc, topn=1000):
        try:
            doc = unicodedata.normalize('NFKC', doc)
            doc = document(doc)
            minhash = doc.get_minhash(doc.k_shingles,
                                      config.MINHASH_CONFIG['num_permutation'])
            return self.lsh.query(minhash, topn)
        except:
            return []

    # insert a document object
    # output: key if document does not exist duplicate item
    # otherwise return alert duplication.
    def insert(self, doc, key=None):
        if key is None:
            key = utils.id_generator()
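        # MINHASH_CONFIG['num_permutation'] must match the
        # LSH_CONFIG['num_permutation'] used to build the forest in __init__.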
        minhash = doc.get_minhash(doc.k_shingles,
                                  config.MINHASH_CONFIG['num_permutation'])
        if len(doc.k_shingles) == 0:
            return u'Does not insert this document to database.\nDocument\'s shingle = 0.\nDocument need to contain at least %d word' \
                   % (config.SHINGLE_CONFIG['k'])
        self.lsh.add(key, minhash)

    def load_model(self):
        self.lsh = self.load('model/lsh.pkl')
        self.docs = self.load('model/docs.pkl')
        self.docs_time = self.load('model/docs_time.pkl')
        if self.lsh is not None and self.docs is not None and self.docs_time is not None:
            return True
        return False

    def save_model(self):
        utils.mkdir('model')
        self.save(self.lsh, 'model/lsh.pkl')
        self.save(self.docs, 'model/docs.pkl')
        self.save(self.docs_time, 'model/docs_time.pkl')
Example #12
    lsh.add(artist,a)

lsh.index()
tester = {}
with open('tester.json') as file:
    tester = json.loads(file.read().encode('latin-1'))
numcorrect_1 = 0
numcorrect_5 = 0
numcorrect_10 = 0
total = 0
for artist,songlist in tester.items():
    for song in songlist:
        m1 = MinHash(num_perm=128)
        songp = clean_text(song['lyrics'])
        for d in songp:
            m1.update(d.encode('utf8'))
        result = lsh.query(m1, 10)
        if len(result):
            total += 1
            if artist in result:
                numcorrect_10 += 1
            if len(result) >= 5:
                if artist in result[:5]:
                    numcorrect_5 += 1
            if artist == result[0]:
                numcorrect_1 += 1
print("Recall @1,@5,@10, total")
print(numcorrect_1/total)
print(numcorrect_5/total)
print(numcorrect_10/total)
print(total)
Example #13
    # print(query_set)

    # Create the query MinHash
    searchMinHash = MinHash(num_perm=NN_PERM)
    # print("===========================================================")
    # print("Query =", value[1])

    # Add the words to it after spell correction
    for word in query_set:
        searchMinHash.update(correction(word).encode('utf8'))

    # allHotels[row["name"]] = (unidecode(row["name"]), row["id"], row['stars'])
    # Using searchMinHash as the query, retrieve the keys with the highest Jaccard

    # Take the first 500 candidates
    result = forest.query(searchMinHash, 500)
    # Sort by the set-similarity metric and by how close the length is to the query string
    result = sorted(result,
                    key=lambda hotel:
                    (-setDistance(preparedSetHotel(hotel), query_set),
                     abs(len(hotel) - len(query_string))))
    end = time.time()
    # result1 = sorted(result, key=lambda hotel: (-setDistance(query_set, preparedSet(hotel)), abs(len(hotel) - len(query_string))))
    # result2 = sorted(result, key=lambda hotel: (-realDistance(preparedSet(hotel), query_set), abs(len(hotel) - len(query_string))))
    # print("Res 1 ________")
    # print(*result[:5], sep='\n')
    # print("Res 2 ________")
    # print(*result1[:5], sep='\n')
    # print("Res 3 ________")
    # print(*result[:5], sep='\n')
    # print(result)
Example #14
    mh = MinHash()
    for d in content:
        mh.update(d.encode('utf8'))
    return mh


# Create the MinHash and LSH Forest objects
minhash_list = []
forest = MinHashLSHForest()
for i in range(len(documents)):
    temp = get_minhash(documents[i])
    minhash_list.append(temp)
    forest.add(i, temp)
forest.index()
"""3.0 寻找某句子的相似对象"""
random.seed(666)
n = random.randint(0, len(sentences))  # 随机生成目标句子的index
target = sentences[n]
print("目标句子:", target)

split_target = split_content(target.replace('\u200b', ''))  # 对目标句子进行分词
minhash_target = get_minhash(split_target)  # 目标句子的MinHash处理

sim_results = forest.query(minhash_target, 3)  # 查找目标句子的Top-3相似句子
for i in range(len(sim_results)):
    print("-" * 30)
    print('Similar sentence index:', sim_results[i])
    print('Jaccard similarity to the target sentence:',
          minhash_target.jaccard(minhash_list[sim_results[i]]))
    print('Similar sentence text:', sentences[sim_results[i]])
Example #15
def printStats(json_filename):
    with open(json_filename) as json_data:
        d = json.load(json_data)

        # Query simple index: queryNum -> queryText
        queryIndex = {}

        # Index of queries as a LSH forest for top-k similar queries.
        queriesLSHIndex = MinHashLSHForest(num_perm=128)

        # You can grok the CSV from stdout by using cut, e.g.,
        #
        # $ python analyzer.py -i ../../data/queries_ASTs.json | grep "csv:" | cut -d':' -f2 > /tmp/out.csv
        print 'csv:"queryNum","numExplicitJoins","referencedTables","groupByColumns","numGroupByClauses"'

        for queryNum, entry in enumerate(d):
            print('\n=> Stats for query number "%s:"' % queryNum)

            # Group by clauses.
            groupByColumns = jmespath.search(
                'ast.statement[*].group.expression[*].name[]', entry)
            print('groupBy columns: %s' % groupByColumns)

            # Base tables when the query has no joins.
            baseTables = jmespath.search(
                'ast.statement[?from.variant == \'table\'].from.name[]', entry)
            print('baseTables: %s' % baseTables)

            # Base tables when the query has joins.
            baseTables += jmespath.search(
                'ast.statement[?from.variant == \'join\'].from.source.name[]',
                entry)
            print('baseTables (with joins): %s' % baseTables)

            # Join tables.
            joinTables = jmespath.search(
                'ast.statement[?from.variant == \'join\'].from.map[*].source.name[]',
                entry)
            print('joinTables: %s' % joinTables)

            # All tables mentioned in the query
            referencedTables = baseTables + joinTables

            # Joins.
            joinPathPrefix = 'ast.statement[*].from.map[*].constraint.on'
            joinsLeft = jmespath.search(joinPathPrefix + '.left.name', entry)
            joinsRight = jmespath.search(joinPathPrefix + '.right.name', entry)
            print('explicit joins (left-hand side): %s' % joinsLeft)
            print('explicit joins (right-hand side): %s' % joinsRight)

            # Text
            queryText = jmespath.search('queryText', entry)

            # Index it into an LSH forest for top-k textually similar queries.
            queryLSH = getQueryMinHash(queryText)
            queryIndex[queryNum] = {
                'queryText': queryText,
                'queryLSH': queryLSH
            }
            queriesLSHIndex.add(queryNum, queryLSH)

            # Sort for a prettier CSV dump.
            referencedTables.sort()
            groupByColumns.sort()
            # CSV header:
            # queryNum,numExplicitJoins,referencedTables,groupByColumns,numGroupByColumns
            print('queryNum = %s' % queryNum)
            print('csv:"%s","%s","%s","%s","%s"' % (
                queryNum, len(joinsLeft[0]) if len(joinsLeft) > 0 else 0,
                ','.join(referencedTables), ','.join(groupByColumns),
                len(groupByColumns)))

            # Populate a reverse index from table to script.
            tableToQuery = {}
            for referencedTable in referencedTables:
                if referencedTable not in tableToQuery:
                    tableToQuery[referencedTable] = [queryNum]
                else:
                    tableToQuery[referencedTable].append(queryNum)

        # Sample search on LSH forest index: top-3 most similar queries.
        queriesLSHIndex.index()
        k = 3
        queryNum = 10
        query = queryIndex[queryNum]
        print('\n\nTop %s queries similar to "%s":' % (k, query['queryText']))
        top_k = queriesLSHIndex.query(query['queryLSH'], k)
        for k in top_k:
            print '\n"%s"' % queryIndex[k]['queryText']
Example #16
    pbar.close()
    forest.index()

    true_labels = []
    pred_labels = []
    zipf_labels = []
    for program, label in real_data.items():
        zipf = real_zipf[program]
        try:
            tokens = program.split()
        except:
            continue
        minhash = MinHash()
        for token in tokens:
            minhash.update(token.encode('utf-8'))
        result = forest.query(minhash, args.k)
        if len(result) == 0:
            continue
        lset = []
        for r in result:
            l = string2label[r]
            if args.dataset == 'codeorg':
                GRAMMAR_DIR = 'src/rubricsampling/grammars/codeorg9_ability'
                inf_e = EngineConditioned(GRAMMAR_DIR,
                                          l,
                                          choice_style='standard')
                _program, _labels, _decisions, _, _ = inf_e.renderProgram()
                _labels = vectorize_labels(_labels, n_labels, label2ix)
                lset.append(_labels)
            elif args.dataset == 'citizenship':
                l = int(l['correctStrategy'])
Example #17
def main():
    corpus = {}
    with open('corpus_data/preprocessedf_corpus.json') as file:
        corpus = json.loads(file.read().encode('Utf-8'))

    def processLyrics(lyrics):
        authors = {}
        for author in lyrics:
            for song in lyrics[author]:
                lyric = re.sub(r'\[[^>]+\]', '', song["lyrics"])
                lyric = re.sub(r'\([^>]+\)', '', lyric)
                lyric = re.sub(r'\{[^>]+\}', '', lyric)
                lyric = lyric.split(r'\s')
                for line in lyric:
                    line = re.sub(r'\n', ' ', line)
                    if author not in authors:
                        authors[author] = line
                    else:
                        authors[author] += line
        return authors

    import nltk
    from nltk.corpus import stopwords
    from collections import defaultdict
    from collections import Counter

    nltk.download('wordnet')
    from nltk.corpus import wordnet as wn

    def get_lemma(word):
        lemma = wn.morphy(word)
        if lemma is None:
            return word
        else:
            return lemma

    from nltk import word_tokenize

    def clean_text(text, ar):
        tokenized_text = word_tokenize(text.lower())
        tokenized_text = [token for token in tokenized_text if len(token) > 5]
        cleaned_text = [
            t for t in tokenized_text
            if re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)
        ]
        if ar == 'sw':
            cleaned_text = [t for t in cleaned_text if t not in STOPWORDS]
        if ar == 'lm':
            cleaned_text = [get_lemma(token) for token in cleaned_text]
        if ar == 'rw':
            cleaned_text = [
                token for token in cleaned_text if token not in PROFANITY
            ]
        return cleaned_text

    STOPWORDS = set(stopwords.words('english'))
    with open('corpus_data/rapsw.txt') as infile:
        infile = infile.read()
        infile = infile.split()
        PROFANITY = set(infile)

    corpus = processLyrics(corpus)

    for author, text in corpus.items():
        corpus[author] = clean_text(text, sys.argv[1])

    artist_shingle = defaultdict(list)
    for artist, lyrics in corpus.items():
        #tokens = [w for w in tokens if not w in sw]
        #shingle3 = set([tuple(tokens[i:i+3]) for i in range(len(tokens) - 3 + 1) if len(tokens[i]) < 10])
        #shingle2 = set([tuple(tokens[i:i+2]) for i in range(len(tokens) - 2 + 1) if len(tokens[i]) < 10])
        shingle1 = lyrics
        # set([tokens[i] for i in range(len(tokens) - 1 + 1) if len(tokens[i]) < 4])
        artist_shingle[artist].append(shingle1)
        #artist_shingle[artist].append(shingle2)
        #artist_shingle[artist].append(shingle3)

    from datasketch import MinHashLSHForest, MinHash

    listlsh = []
    lsh = MinHashLSHForest(num_perm=128)
    for artist, sets in artist_shingle.items():
        a = MinHash(num_perm=128)
        for d in sets[0]:
            a.update(d.encode('utf8'))
        listlsh.append(a)
        lsh.add(artist, a)

    lsh.index()

    m1 = MinHash(num_perm=128)
    g = []
    with open(sys.argv[2]) as g:
        g = g.read()
        g = g.split()
    for d in g:
        m1.update(d.encode('utf8'))

    result = lsh.query(m1, 5)
    print(" (Up to) Top 5 candidates", result)
Example #18
data3 = ['minhash', 'is', 'probability', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'documents']

# Create MinHash objects
m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))

# Create a MinHash LSH Forest with the same num_perm parameter
forest = MinHashLSHForest(num_perm=128)

# Add m2 and m3 into the index
forest.add("m2", m2)
forest.add("m3", m3)

# IMPORTANT: must call index() otherwise the keys won't be searchable
forest.index()

# Check for membership using the key
print("m2" in forest)
print("m3" in forest)

# Using m1 as the query, retrieve top 2 keys that have the highest Jaccard
result = forest.query(m1, 2)
print("Top 2 candidates", result)
Example #19
class LshSamplesEval:

	'''STUDENT_PATH = '../studentData/liftoff/'
	STANDARD_PATH = '../data/raw/liftoff/standard/'
	UNIFORM_PATH = '../data/raw/liftoff/uniform/'
	TEMPERED_PATH = '../data/raw/liftoff/tempered/'''

	def __init__(self, studentDataPath, sampledDataPath):
		print('Loading data...')
		self.studentDataPath = studentDataPath
		print(self.studentDataPath)
		self.sampledDataPath = sampledDataPath
		print(self.sampledDataPath)
		self.sampledData = self.loadSyntheticData()
		self.studentData = self.loadStudentData()

	def loadStudentData(self):
		path = self.studentDataPath + STUDENT_NAME
		datadict = pickle.load(open(path, "rb" ))
		return list(datadict.keys())

	def loadSyntheticData(self):
		standard_path = self.sampledDataPath + '/standard/train' + SYNTH_NAME
		uniform_path = self.sampledDataPath + '/uniform/train' + SYNTH_NAME
		tempered_path = self.sampledDataPath + '/tempered/train' + SYNTH_NAME
		standardDict = pickle.load(open(standard_path, "rb" ))
		uniformDict = pickle.load(open(uniform_path, "rb" ))
		temperedDict =  pickle.load(open(tempered_path, "rb" ))
		#import pdb; pdb.set_trace()
		return list(standardDict.values()) + list(uniformDict.values()) + list(temperedDict.values())

	def computeLshNN(self):
		print('Processing sampled and student data...')
		sampledSets = self.processData(self.sampledData)
		studentSets = self.processData(self.studentData)

		print('Finding nearest neighbors from sampled data...')
		sampledScores = self.constructNNList(studentSets, sampledSets, self.studentData, self.sampledData)

		print('Found nearest neighbors for data!')

		# self.constructHistogram(sampledScores)

		return sampledScores

	# tokenize every sentence and return a list of sentences
	def processData(self, dataset):
		processed = []
		for datum in dataset:
			splitCode = datum.split()
			processed.append(splitCode)
		return processed

	# runs MinHashLsh
	def constructNNList(self, studentSets, sampledSets, studentData, sampledData):
		print('Creating min-hashes for student data')
		self.studentMinHashes = self.createMinHash(studentSets)
		print('Creating min-hashes for rubric data')
		self.sampledMinHashes = self.createMinHash(sampledSets)

		self.forest = MinHashLSHForest(num_perm = 128)
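		# num_perm here must match the num_perm=128 used when building the MinHashes in createMinHash()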
		i = 0
		for minHash in self.sampledMinHashes:
			self.forest.add(str(i), minHash)
			i += 1

		self.forest.index()

		print("calculating nearest neighbor")
		scores = []
		for i, query in enumerate(tqdm(self.studentMinHashes)):
			result = self.forest.query(query, 1)
			indexMatch = int(result[0])
			# These prints show examples of
			# student code and their nearest neighbor.
			print(result)
			print('Student Code: \n')
			print(studentData[i])
			print('\n')
			print('Closest Sampled Code: \n')
			print(sampledData[indexMatch])
			print('\n')
			score = self.sampledMinHashes[indexMatch].jaccard(query)
			print('Score:', score)

			scores.append(score)

		return scores

	# create minHash objects for every dataset
	def createMinHash(self, dataset):
		minHashes = []
		for code in tqdm(dataset):
			minHash = MinHash(num_perm = 128)
			for d in code: # TODO modify this for n-grams
				minHash.update("".join(d).encode('utf-8'))
			minHashes.append(minHash)
		return minHashes

	def constructHistogram(self, scores):
		plt.hist(scores)
		plt.xlabel('Jaccard Similarity Score')
		plt.ylabel('Counts')
		plt.show()
Example #20
    'the', 'similarity', 'between', 'documents'
]

dataset = [[0., 0., 0.], [0., 0., 1.], [0., 1., 0.], [0., 1., 1.]]
# Create a MinHash LSH Forest with the same num_perm parameter
forest = MinHashLSHForest(num_perm=128)

for i, data in enumerate(dataset):
    m = MinHash(num_perm=128)
    for d in data:
        m.update(str(d).encode('utf8'))
    forest.add(str(i), m)

# IMPORTANT: must call index() otherwise the keys won't be searchable

pickle.dump(forest, open('forest.lsh', 'wb'))
del forest
forest = pickle.load(open('forest.lsh', 'rb'))
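# A MinHashLSHForest can be pickled; here the unindexed forest is serialized,
# reloaded, and then index() is called to make its keys searchable.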

forest.index()

# Check for membership using the key
print("1" in forest)
print("2" in forest)

m = MinHash(num_perm=128)
for d in dataset[0]:
    m.update(str(d).encode('utf8'))
# Using m as the query, retrieve up to 10 keys that have the highest Jaccard
result = forest.query(m, 10)
print("Top 10 candidates", result)
Example #21
# TODO: neither of these work well with puzzles...

df = pd.read_csv(
    '../chess-opening/csvs/lichess_db_standard_rated_2020-08_600+0.csv',
    nrows=1000000)


def create_min_hash(fens):
    min_hash = MinHash(num_perm=128)
    for fen in fens:
        min_hash.update(fen.encode('utf8'))
    return min_hash


user_df = df.groupby('username').agg({'fen': set, 'elo': 'mean'})
user_df['min_hash'] = user_df['fen'].apply(create_min_hash)

forest = MinHashLSHForest(num_perm=128)
for row in user_df.itertuples():
    forest.add(row.Index, row.min_hash)

forest.index()

for i in range(10):
    result = forest.query(user_df['min_hash'].iloc[i], 10)

    elos = []
    for username in result:
        elos.append(user_df.loc[username]['elo'])
    print(user_df['elo'].iloc[i], np.mean(elos))