Example #1
    def _get_forest(self, data, perms):

        # START Time
        self.START_TIME = time.time()

        minhash_list = []

        for text in data['text']:
            min_hashtext = _create_hashtex(text=text,
                                           perms=perms,
                                           language=self.LANGUAGE)
            minhash_list.append(min_hashtext)

        forest = MinHashLSHForest(num_perm=perms)

        for item_index, list_item in enumerate(minhash_list):
            forest.add(item_index, list_item)

        forest.index()

        # END Time
        self.END_TIME = time.time()

        # TIMING LIST
        self.TIMING = [self.END_TIME, self.START_TIME]

        print('It took %s seconds to build forest.' %
              (calculate_duration(self.TIMING)))
        return forest
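For context, a forest returned by a builder like this is queried with a MinHash produced the same way as the indexed texts. A minimal sketch (the _query_forest name and top_k parameter are illustrative; _create_hashtex and self.LANGUAGE are taken from the example above):

    def _query_forest(self, forest, data, query_text, perms, top_k=5):
        # Hash the query text exactly like the indexed texts were hashed.
        query_hash = _create_hashtex(text=query_text,
                                     perms=perms,
                                     language=self.LANGUAGE)
        # query() returns up to top_k of the positional keys added above.
        top_indices = forest.query(query_hash, top_k)
        texts = list(data['text'])
        return [(i, texts[i]) for i in top_indices]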
Example #2
def mylshforest(corpus):
    #print(len(corpus))
    forest = MinHashLSHForest(num_perm=32)
    score_res = [0]
    mh = []
    for i in range(len(corpus) - 1):
        doc = corpus[i]
        doc2 = corpus[i + 1]
        m = MinHash(num_perm=32)
        for d in doc:
            m.update(d.encode('utf8'))
        forest.add(str(i), m)
        forest.index()
        mh.append(m)

        m2 = MinHash(num_perm=32)
        for d in doc2:
            m2.update(d.encode('utf8'))
        result = forest.query(m2, 10)
        score = 0.0
        for j in range(len(result)):
            score = score + m2.jaccard(mh[int(result[j])])
        if len(result) > 0:
            score = score / len(result)
        score_res.append(score)
    return score_res
Example #3
class MinHas(BaseANN):
    def __init__(self, metric, n_perm, n_rep):
        if metric not in ('jaccard',):
            raise NotImplementedError(
                "Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
        self._index.index()

    def query(self, v, n):
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))
        result = self._index.query(m, n)
        return map(int, result)
Example #4
def search_lshforest_jaccard_topk(index_data, query_data, b, r, k):
    (index_sets, index_keys, index_minhashes) = index_data
    (query_sets, query_keys, query_minhashes) = query_data
    num_perm = b * r
    print("Building LSH Forest Index.")
    start = time.perf_counter()
    index = MinHashLSHForest(num_perm=num_perm, l=b)
    # Use the indices of the indexed sets as keys in LSH.
    for i in range(len(index_keys)):
        index.add(i, index_minhashes[num_perm][i])
    index.index()
    end = time.perf_counter()
    print("Indexing time: {:.3f}.".format(end - start))
    print("Querying.")
    times = []
    results = []
    for query_minhash, query_key, query_set in \
            zip(query_minhashes[num_perm], query_keys, query_sets):
        start = time.perf_counter()
        result = index.query(query_minhash, k * 2)
        # Recover the retrieved indexed sets and
        # compute the exact Jaccard similarities.
        result = [[index_keys[i],
                   compute_jaccard(query_set, index_sets[i])] for i in result]
        # Sort by similarity.
        result.sort(key=lambda x: x[1], reverse=True)
        # Take the top k.
        result = result[:k]
        duration = time.perf_counter() - start
        times.append(duration)
        results.append((query_key, result))
        sys.stdout.write(f"\rQueried {len(results)} sets")
    sys.stdout.write("\n")
    return (results, times)
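compute_jaccard is not shown in this example; a minimal sketch of the exact-Jaccard helper it appears to rely on for re-ranking (assuming the indexed and query sets are plain Python sets or other iterables of hashable items):

def compute_jaccard(set_a, set_b):
    # Exact Jaccard similarity |A ∩ B| / |A ∪ B|, used to re-rank the LSH Forest candidates.
    a, b = set(set_a), set(set_b)
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)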
Example #5
def getMinhashforest2(minhashs):
    # Create a MinHash LSH Forest with the same num_perm as the MinHash objects
    forest = MinHashLSHForest(num_perm=128)
    for i in range(len(minhashs)):
        # Add each MinHash into the index
        forest.add(i, minhashs[i])
    # IMPORTANT: must call index() otherwise the keys won't be searchable
    forest.index()
    return forest
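A short usage sketch for the forest returned by getMinhashforest2; query_minhashforest is a hypothetical helper, and the query MinHash is built with the same num_perm=128 noted in the comments above (query() only returns keys added before the last index() call):

def query_minhashforest(forest, query_tokens, top_k=3):
    # Build the query MinHash with the same num_perm used for the forest.
    m = MinHash(num_perm=128)
    for token in query_tokens:
        m.update(token.encode('utf8'))
    # Returns up to top_k of the integer keys added in getMinhashforest2.
    return forest.query(m, top_k)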
def build_lsh_forest_hash(game_data):
    forest = MinHashLSHForest(num_perm=_utils.HASH_REZ)
    for ind, row in game_data.iterrows():
        try:
            forest.add(f"{row['title']} (id:{row['id']})", row['_sim_hash'])
        except ValueError:
            print(f"{row['title']} already added")
        except:
            raise
    forest.index()
    return forest
Example #7
def create_LSH_Forest():
    global forest
    if os.path.isfile(LSH_FOREST_FILE):
        load_forest()
    else:
        forest = MinHashLSHForest(num_perm=128)
    train_records = glob.glob("dataset/train*.tfrecord")
    validate_records = glob.glob("dataset/validate*.tfrecord")
    all_records = train_records + validate_records
    dataset = tf.data.TFRecordDataset(all_records)
    iterator = dataset.make_one_shot_iterator()
    count = 0
    next_element = iterator.get_next()
    updated = False
    with tf.Session() as sess:
        try:
            while True:
                if count % 10000 == 0:
                    print "[SimpleVideoSearch][{}] Processed {} records from the dataset so far".format(
                        datetime.now(), count)
                if updated and count % 100000 == 0:
                    with open(LSH_FOREST_FILE, 'wb') as forest_file:
                        forest.index()
                        pickle.dump(forest, forest_file,
                                    pickle.HIGHEST_PROTOCOL)
                    print "[SimpleVideoSearch][{}] Updated LSH Forest file".format(
                        datetime.now(), count)
                exampleBinaryString = sess.run(next_element)
                example = tf.train.Example.FromString(exampleBinaryString)
                count += 1
                example_id = example.features.feature["id"].bytes_list.value[0]
                if example_id not in forest:
                    if not updated:
                        updated = True
                        print '[SimpleVideoSearch][{}] First update at record {}'.format(
                            datetime.now(), count)
                    dataset_labels_full = convert_dataset_labels_to_list(
                        example.features.feature["labels"].int64_list.value)
                    minhash = MinHash(num_perm=128)
                    for label in dataset_labels_full:
                        minhash.update(label)
                    forest.add(example_id, minhash)
        except tf.errors.OutOfRangeError:
            print "[SimpleVideoSearch][{}] Done iterating through dataset".format(
                datetime.now())
        finally:
            print "[SimpleVideoSearch][{}] Processed {} records from the dataset".format(
                datetime.now(), count)
            forest.index()
            with open(LSH_FOREST_FILE, 'wb') as forest_file:
                pickle.dump(forest, forest_file, pickle.HIGHEST_PROTOCOL)
            print "[SimpleVideoSearch][{}] Finished creating LSH Forest file".format(
                datetime.now(), count)
Example #8
def construct_lsh(obj_dict):
    forest = MinHashLSHForest(num_perm=128)
    keys = list(obj_dict.keys())
    values = list(obj_dict.values())
    ms = []
    for i in range(len(keys)):
        temp = MinHash(num_perm=128)
        for d in values[i]:
            temp.update(d.encode('utf8'))
        ms.append(temp)
        forest.add(keys[i], temp)
    forest.index()
    return forest, keys, ms
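A sketch of how the three values returned by construct_lsh might be used together: the forest proposes candidate keys, and the parallel ms list allows re-scoring them with the MinHash Jaccard estimate (query_lsh and k are illustrative names, not part of the original):

def query_lsh(forest, keys, ms, query_tokens, k=5):
    q = MinHash(num_perm=128)
    for d in query_tokens:
        q.update(d.encode('utf8'))
    # Candidate keys from the forest, re-scored by estimated Jaccard, best first.
    candidates = forest.query(q, k)
    scored = [(key, q.jaccard(ms[keys.index(key)])) for key in candidates]
    return sorted(scored, key=lambda x: x[1], reverse=True)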
Example #9
def target_lsh(grams):
    lsh_forest = MinHashLSHForest(num_perm=4000, l=200)
    lsh = MinHashLSH(threshold=0.5, num_perm=4000)
    # minhashes = {}
    for c, i in enumerate(grams):
        minhash = MinHash(num_perm=4000)
        i = i.replace(' ', '')
        for d in ngrams(i, 3):
            minhash.update(''.join(d).encode('utf8'))

        lsh_forest.add(c, minhash)
        lsh.insert(c, minhash)
    # Index once after the loop; the forest is not queried inside it.
    lsh_forest.index()
    return lsh_forest, lsh
Example #10
def store_lsh():
    forest = MinHashLSHForest(num_perm=128)
    documents_en = docs_col.find({"lang": 'english'})
    for item in documents_en:
        minhash = MinHash(num_perm=128)
        ngrams = ngrams_token(remove_punctuation(item['content']), 3)
        for ngram in ngrams:
            minhash.update(ngram.encode("utf-8"))
        forest.add(str(item["_id"]), minhash)
    forest.index()
    with open('pickle_ngram.txt', 'wb') as ouf:
        cPickle.dump(forest, ouf)
    return forest
Example #11
    def build_lsh_forest(self, company_name_column_name):
        """
        Build the LSH forest data structure from the sets of parsed description words for each company

        Parameters:

            company_name_column_name - string; name of the company name column in the company corpus dataframe
        """
        # Note: num_perm is a tuning parameter, but has been abstracted away for simplicity
        #       256 has been found to be a good amount. Increasing it may increase accuracy,
        #       but will decrease speed and increase memory usage. Decreasing will decrease accuracy

        lsh_forest = MinHashLSHForest(num_perm=256)

        iteration = 1

        self.company_name_column_name = company_name_column_name
        self.name_to_index_map = dict(
            zip(self.company_corpus.corpus.loc[:, company_name_column_name],
                self.company_corpus.corpus.index))
        self.index_to_name_map = dict(
            zip(self.company_corpus.corpus.index,
                self.company_corpus.corpus.loc[:, company_name_column_name]))

        sys.stdout.write("Performing LSH...")
        for company in self.company_corpus.corpus.iterrows():

            # Use the 'datasketch' library to MinHash the company descriptions and hash them into the LSH forest
            company_name = company[1][company_name_column_name]
            if company_name in self.dict_of_minhash_keys:
                continue
            mh = MinHash(num_perm=256)
            if type(company[1]['rare_words']) is float:
                mh.update(str(company[1]['rare_words']).encode('utf8'))
            else:
                for word in company[1]['rare_words']:
                    mh.update(str(word).encode('utf8'))
            self.dict_of_minhash_keys[company_name] = mh
            lsh_forest.add(company_name, mh)

            iteration += 1
        sys.stdout.write('\n')
        sys.stdout.write("Done performing LSH!\n")

        # Need this line below to be able to query LSH forest! (See datasketch docs on LSH forest for reasoning)
        lsh_forest.index()
        self.lsh_forest = lsh_forest
    def __train_LSH(self,data):
        start_time = time.time()
        forest = MinHashLSHForest(num_perm=config.permutations)
        for item in tqdm(data, desc="MinHash Docs.."):
            tag = item['tag']
            tokens = item['data']

            if self.type == 'trigram':
                tokens = self.normalizer.generate_ngrams_char(tokens[0])
            m = MinHash(num_perm=config.permutations)
            for s in tokens:
                m.update(s.encode('utf8'))
            forest.add(tag,m)

        forest.index()
        print('It took %.2f seconds to build forest.' % (time.time() - start_time))
        return forest
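The num_perm trade-off described in the comments above (more permutations tends to mean better accuracy, but more time and memory) can be checked directly on a toy pair of token sets; a small illustrative sketch, with output varying from run to run:

from datasketch import MinHash

tokens_a = set('minhash lsh forest jaccard similarity index query'.split())
tokens_b = set('minhash lsh forest cosine similarity ranking query'.split())
exact = len(tokens_a & tokens_b) / len(tokens_a | tokens_b)

for num_perm in (16, 64, 256):
    ma, mb = MinHash(num_perm=num_perm), MinHash(num_perm=num_perm)
    for t in tokens_a:
        ma.update(t.encode('utf8'))
    for t in tokens_b:
        mb.update(t.encode('utf8'))
    # The estimate generally gets closer to the exact Jaccard as num_perm grows.
    print(num_perm, 'exact=%.3f' % exact, 'estimate=%.3f' % ma.jaccard(mb))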
Example #13
    def build_lsh_forest(self, company_name_column_name):

        # Note: num_perm is a tuning parameter, but has been abstracted away for simplicity
        #       256 has been found to be a good amount. Increasing it may increase accuracy,
        #       but will decrease speed and increase memory usage. Decreasing will decrease accuracy

        lsh_forest = MinHashLSHForest(num_perm=256)
        iteration = 0

        self.company_name_column_name = company_name_column_name
        self.name_to_index_map = dict(
            zip(self.company_corpus.corpus.loc[:, company_name_column_name],
                self.company_corpus.corpus.index))
        self.index_to_name_map = dict(
            zip(self.company_corpus.corpus.index,
                self.company_corpus.corpus.loc[:, company_name_column_name]))

        graph_size = self.company_corpus.corpus.shape[0]

        for company in self.company_corpus.corpus.iterrows():
            company_name = company[1][company_name_column_name]
            if company_name in self.dict_of_minhash_keys:
                continue
            mh = MinHash(num_perm=256)
            if type(company[1]['rare_words']) is float:
                mh.update(str(company[1]['rare_words']).encode('utf8'))
            else:
                for word in company[1]['rare_words']:
                    mh.update(str(word).encode('utf8'))
            self.dict_of_minhash_keys[company_name] = mh
            lsh_forest.add(company_name, mh)
            if iteration % 10000 == 0 or (iteration + 1) == graph_size:
                if (iteration + 1) == graph_size:
                    iteration += 1
                sys.stdout.write('\r')
                sys.stdout.write(
                    "LSH Forest Build Percent Complete: {0:0.2f}%".format(
                        round((iteration / graph_size) * 100)))
                sys.stdout.flush()
            iteration += 1
        sys.stdout.write('\n')

        # Must call index() before the forest can be queried.
        lsh_forest.index()
        self.lsh_forest = lsh_forest
Example #14
def toBuildLSH(cleanSongs):
    '''
    :param cleanSongs: iterable of tokenized songs (lists of words)
    :return: forest, min_hash_list
    '''
    forest = MinHashLSHForest(num_perm=128)
    min_hash_list = []
    for songIndex, song in enumerate(cleanSongs):
        minhash = MinHash(num_perm=128)
        for word in song:
            ### encoding each word
            minhash.update(word.encode('utf8'))
        ### add each song's minhash to the forest as well as min_hash_list
        forest.add(str(songIndex), minhash)
        min_hash_list.append(minhash)

    forest.index()
    return forest, min_hash_list
Example #15
    def get_forest(self, data, perms):

        minhash = []

        for text in data['err']:
            tokens = self.preprocess(text)
            m = MinHash(num_perm=perms)
            for s in tokens:
                m.update(s.encode('utf8'))
            minhash.append(m)

        forest = MinHashLSHForest(num_perm=perms)

        for i, m in enumerate(minhash):
            forest.add(i, m)

        forest.index()

        return forest
Example #16
    def form_lsh(self):
        minhash = []

        for s in self.__items:
            m = MinHash(num_perm=256)
            for q in s:
                m.update(q.encode('utf8'))
            minhash.append(m)

        forest = MinHashLSHForest(num_perm=256)

        for i, m in enumerate(minhash):
            forest.add(i, m)

        forest.index()
        self.__forest = forest
        self.__hashlist = minhash

        return forest
Example #17
def benchmark_lshforest(num_perm, l, k, index_data, query_data):
    print("Building LSH Forest index")
    forest = MinHashLSHForest(num_perm=num_perm, l=l)
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        forest.add(key, minhash)
    forest.index()
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        start = time.perf_counter()
        result = forest.query(minhash, k)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(
            sorted([[key, _compute_jaccard(qs, index_data.sets[key])]
                    for key in result],
                   key=lambda x: x[1],
                   reverse=True))
    return times, results
Example #18
def build_lsh_forest(columns, override=False):
    """
    Builds a minHash LSH forest which can be used to query top-k columns with maximum Jaccard similarity
    @param override:
    @param columns:
    @return:
    """
    file_path = f'{os.environ["WORKING_DIRECTORY"]}/results/forest.obj'
    if override or not os.path.isfile(file_path):
        forest = MinHashLSHForest(num_perm=NUM_PERM)
        for column in columns:
            forest.add(f'{column["table"]}.{column["column"]}', deserialize_minhash(column))
        forest.index()
        with open(file_path, 'wb') as file:
            pickle.dump(forest, file)
        return forest

    with open(file_path, 'rb') as file:
        forest = pickle.load(file)

    return forest
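A hedged sketch of how the cached forest from build_lsh_forest might be queried for the top-k most similar columns; query_top_k_columns is a hypothetical helper, and it assumes the same NUM_PERM constant used when the forest was built:

def query_top_k_columns(forest, values, k=10):
    # Build a query MinHash over the column's values with the same NUM_PERM.
    m = MinHash(num_perm=NUM_PERM)
    for value in values:
        m.update(str(value).encode('utf-8'))
    # Returns up to k keys of the form 'table.column' added in build_lsh_forest.
    return forest.query(m, k)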
Example #19
def get_forest(records, perms):
    start_time = time.time()

    minhash = []
    for record in records:
        for text in record:
            tokens = preprocess(text)
            m = MinHash(num_perm=perms)
            for s in tokens:
                m.update(s.encode('utf8'))
            minhash.append(m)

    # Create the forest once, after all MinHashes have been built.
    forest = MinHashLSHForest(num_perm=perms)
    for i, m in enumerate(minhash):
        forest.add(i, m)

    forest.index()

    print('It took %s seconds to build forest.' % (time.time() - start_time))

    return forest
Example #20
    def __datasketch_fit(self):
        if self.kwargs['create']:
            # Create a list of MinHash objects
            min_hash_obj_list = []
            forest = MinHashLSHForest(num_perm=self.kwargs['num_perm'])
            for i in range(len(self.features)):
                min_hash_obj_list.append(
                    MinHash(num_perm=self.kwargs['num_perm']))
                for d in self.features[i]:
                    min_hash_obj_list[i].update(d)
                forest.add(i, min_hash_obj_list[i])
            # IMPORTANT: must call index() otherwise the keys won't be searchable
            forest.index()
            with open(self.kwargs['file_path'], "wb") as f:
                pickle.dump(forest, f)
                pickle.dump(min_hash_obj_list, f)
            self.predictor = [forest, min_hash_obj_list]
        else:
            with open(self.kwargs['file_path'], "rb") as f:
                forest = pickle.load(f)
                min_hash_obj_list = pickle.load(f)
                self.predictor = [forest, min_hash_obj_list]
Example #21
def get_forest(data, perms):
    start_time = time.time()

    minhash = []

    for text in data:
        tokens = p.preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf-8'))
        minhash.append(m)

    forest = MinHashLSHForest(num_perm=perms)

    for i, m in enumerate(minhash):
        forest.add(i, m)

    forest.index()

    print('time to build forest: ', (time.time() - start_time))

    return forest
Example #22
class DataSketch(BaseANN):
    def __init__(self, metric, n_perm, n_rep):
        if metric not in ('jaccard',):
            raise NotImplementedError("Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
        self._index.index()

    def query(self, v, n):
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))
        return map(int, self._index.query(m, n))
Example #23
    'the', 'similarity', 'between', 'documents'
]

dataset = [[0., 0., 0.], [0., 0., 1.], [0., 1., 0.], [0., 1., 1.]]
# Create a MinHash LSH Forest with the same num_perm as the MinHash objects
forest = MinHashLSHForest(num_perm=128)

for i, data in enumerate(dataset):
    m = MinHash(num_perm=128)
    for d in data:
        m.update(str(d).encode('utf8'))
    forest.add(str(i), m)

# IMPORTANT: must call index() otherwise the keys won't be searchable

pickle.dump(forest, open('forest.lsh', 'wb'))
del forest
forest = pickle.load(open('forest.lsh', 'rb'))

forest.index()

# Check for membership using the key
print("1" in forest)
print("2" in forest)

m = MinHash(num_perm=128)
for d in dataset[0]:
    m.update(str(d).encode('utf8'))
# Using m as the query, retrieve up to the top 10 keys with the highest Jaccard
result = forest.query(m, 10)
print("Top 10 candidates", result)
Example #24
def main():
    corpus = {}
    with open('corpus_data/preprocessedf_corpus.json') as file:
        corpus = json.loads(file.read().encode('Utf-8'))

    def processLyrics(lyrics):
        authors = {}
        for author in lyrics:
            for song in lyrics[author]:
                lyric = re.sub(r'\[[^>]+\]', '', song["lyrics"])
                lyric = re.sub(r'\([^>]+\)', '', lyric)
                lyric = re.sub(r'\{[^>]+\}', '', lyric)
                lyric = re.split(r'\s', lyric)
                for line in lyric:
                    line = re.sub(r'\n', ' ', line)
                    if author not in authors:
                        authors[author] = line
                    else:
                        authors[author] += line
        return authors

    import nltk
    from nltk.corpus import stopwords
    from collections import defaultdict
    from collections import Counter

    nltk.download('wordnet')
    from nltk.corpus import wordnet as wn

    def get_lemma(word):
        lemma = wn.morphy(word)
        if lemma is None:
            return word
        else:
            return lemma

    from nltk import word_tokenize

    def clean_text(text, ar):
        tokenized_text = word_tokenize(text.lower())
        tokenized_text = [token for token in tokenized_text if len(token) > 5]
        cleaned_text = [
            t for t in tokenized_text
            if re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)
        ]
        if ar == 'sw':
            cleaned_text = [t for t in cleaned_text if t not in STOPWORDS]
        if ar == 'lm':
            cleaned_text = [get_lemma(token) for token in cleaned_text]
        if ar == 'rw':
            cleaned_text = [
                token for token in cleaned_text if token not in PROFANITY
            ]
        return cleaned_text

    STOPWORDS = set(stopwords.words('english'))
    with open('corpus_data/rapsw.txt') as infile:
        infile = infile.read()
        infile = infile.split()
        PROFANITY = set(infile)

    corpus = processLyrics(corpus)

    for author, text in corpus.items():
        corpus[author] = clean_text(text, sys.argv[1])

    artist_shingle = defaultdict(list)
    for artist, lyrics in corpus.items():
        #tokens = [w for w in tokens if not w in sw]
        #shingle3 = set([tuple(tokens[i:i+3]) for i in range(len(tokens) - 3 + 1) if len(tokens[i]) < 10])
        #shingle2 = set([tuple(tokens[i:i+2]) for i in range(len(tokens) - 2 + 1) if len(tokens[i]) < 10])
        shingle1 = lyrics
        # set([tokens[i] for i in range(len(tokens) - 1 + 1) if len(tokens[i]) < 4])
        artist_shingle[artist].append(shingle1)
        #artist_shingle[artist].append(shingle2)
        #artist_shingle[artist].append(shingle3)

    from datasketch import MinHashLSHForest, MinHash

    listlsh = []
    lsh = MinHashLSHForest(num_perm=128)
    for artist, sets in artist_shingle.items():
        a = MinHash(num_perm=128)
        for d in sets[0]:
            a.update(d.encode('utf8'))
        listlsh.append(a)
        lsh.add(artist, a)

    lsh.index()

    m1 = MinHash(num_perm=128)
    g = []
    with open(sys.argv[2]) as g:
        g = g.read()
        g = g.split()
    for d in g:
        m1.update(d.encode('utf8'))

    result = lsh.query(m1, 5)
    print(" (Up to) Top 5 candidates", result)
Example #25
def printStats(json_filename):
    with open(json_filename) as json_data:
        d = json.load(json_data)

        # Query simple index: queryNum -> queryText
        queryIndex = {}

        # Reverse index from referenced table -> query numbers (populated in the loop below).
        tableToQuery = {}

        # Index of queries as an LSH forest for top-k similar queries.
        queriesLSHIndex = MinHashLSHForest(num_perm=128)

        # You can grok the CSV from stdout by using cut, e.g.,
        #
        # $ python analyzer.py -i ../../data/queries_ASTs.json | grep "csv:" | cut -d':' -f2 > /tmp/out.csv
        print 'csv:"queryNum","numExplicitJoins","referencedTables","groupByColumns","numGroupByClauses"'

        for queryNum, entry in enumerate(d):
            print '\n=> Stats for query number \"%s:\"' % queryNum

            # Group by clauses.
            groupByColumns = jmespath.search(
                'ast.statement[*].group.expression[*].name[]', entry)
            print 'groupBy columns: %s' % groupByColumns

            # Base tables when the query has no joins.
            baseTables = jmespath.search(
                'ast.statement[?from.variant == \'table\'].from.name[]', entry)
            print 'baseTables: %s' % baseTables

            # Base tables when the query has joins.
            baseTables += jmespath.search(
                'ast.statement[?from.variant == \'join\'].from.source.name[]',
                entry)
            print 'baseTables (with joins): %s' % baseTables

            # Join tables.
            joinTables = jmespath.search(
                'ast.statement[?from.variant == \'join\'].from.map[*].source.name[]',
                entry)
            print 'joinTables: %s' % joinTables

            # All tables mentioned in the query
            referencedTables = baseTables + joinTables

            # Joins.
            joinPathPrefix = 'ast.statement[*].from.map[*].constraint.on'
            joinsLeft = jmespath.search(joinPathPrefix + '.left.name', entry)
            joinsRight = jmespath.search(joinPathPrefix + '.right.name', entry)
            print 'explicit joins (left-hand side): %s' % joinsLeft
            print 'explicit joins (right-hand side): %s' % joinsRight

            # Text
            queryText = jmespath.search('queryText', entry)

            # Index it into an LSH forest for top-k textually similar queries.
            queryLSH = getQueryMinHash(queryText)
            queryIndex[queryNum] = {
                'queryText': queryText,
                'queryLSH': queryLSH
            }
            queriesLSHIndex.add(queryNum, queryLSH)

            # Sort for a prettier CSV dump.
            referencedTables.sort()
            groupByColumns.sort()
            # CSV header:
            # queryNum,numExplicitJoins,referencedTables,groupByColumns,numGroupByColumns
            print 'queryNum = %s' % queryNum
            print 'csv:"%s","%s","%s","%s","%s"' % (
                queryNum, len(joinsLeft[0]) if len(joinsLeft) > 0 else 0,
                ','.join(referencedTables), ','.join(groupByColumns),
                len(groupByColumns))

            # Populate the reverse index from table to query.
            for referencedTable in referencedTables:
                if referencedTable not in tableToQuery:
                    tableToQuery[referencedTable] = [queryNum]
                else:
                    tableToQuery[referencedTable].append(queryNum)

        # Sample search on LSH forest index: top-3 most similar queries.
        queriesLSHIndex.index()
        k = 3
        queryNum = 10
        query = queryIndex[queryNum]
        print '\n\nTop %s queries similar to "%s":' % (k, query['queryText'])
        top_k = queriesLSHIndex.query(query['queryLSH'], k)
        for k in top_k:
            print '\n"%s"' % queryIndex[k]['queryText']
class LshSamplesEval:

	'''STUDENT_PATH = '../studentData/liftoff/'
	STANDARD_PATH = '../data/raw/liftoff/standard/'
	UNIFORM_PATH = '../data/raw/liftoff/uniform/'
	TEMPERED_PATH = '../data/raw/liftoff/tempered/'''

	def __init__(self, studentDataPath, sampledDataPath):
		print('Loading data...')
		self.studentDataPath = studentDataPath
		print(self.studentDataPath)
		self.sampledDataPath = sampledDataPath
		print(self.sampledDataPath)
		self.sampledData = self.loadSyntheticData()
		self.studentData = self.loadStudentData()

	def loadStudentData(self):
		path = self.studentDataPath + STUDENT_NAME
		datadict = pickle.load(open(path, "rb" ))
		return list(datadict.keys())

	def loadSyntheticData(self):
		standard_path = self.sampledDataPath + '/standard/train' + SYNTH_NAME
		uniform_path = self.sampledDataPath + '/uniform/train' + SYNTH_NAME
		tempered_path = self.sampledDataPath + '/tempered/train' + SYNTH_NAME
		standardDict = pickle.load(open(standard_path, "rb" ))
		uniformDict = pickle.load(open(uniform_path, "rb" ))
		temperedDict =  pickle.load(open(tempered_path, "rb" ))
		#import pdb; pdb.set_trace()
		return list(standardDict.values()) + list(uniformDict.values()) + list(temperedDict.values())

	def computeLshNN(self):
		print('Processing sampled and student data...')
		sampledSets = self.processData(self.sampledData)
		studentSets = self.processData(self.studentData)

		print('Finding nearest neighbors from sampled data...')
		sampledScores = self.constructNNList(studentSets, sampledSets, self.studentData, self.sampledData)

		print('Found nearest neighbors for data!')

		# self.constructHistogram(sampledScores)

		return sampledScores

	# tokenize every sentence and return a list of sentences
	def processData(self, dataset):
		processed = []
		for datum in dataset:
			splitCode = datum.split()
			processed.append(splitCode)
		return processed

	# runs MinHashLsh
	def constructNNList(self, studentSets, sampledSets, studentData, sampledData):
		print('Creating min-hashes for student data')
		self.studentMinHashes = self.createMinHash(studentSets)
		print('Creating min-hashes for rubric data')
		self.sampledMinHashes = self.createMinHash(sampledSets)

		self.forest = MinHashLSHForest(num_perm = 128)
		i = 0
		for minHash in self.sampledMinHashes:
			self.forest.add(str(i), minHash)
			i += 1

		self.forest.index()

		print("calculating nearest neighbor")
		scores = []
		for i, query in enumerate(tqdm(self.studentMinHashes)):
			result = self.forest.query(query, 1)
			indexMatch = int(result[0])
			# These print examples of
			# student code and their nearest neighbor.
			print(result)
			print('Student Code: \n')
			print(studentData[i])
			print('\n')
			print('Closest Sampled Code: \n')
			print(sampledData[indexMatch])
			print('\n')
			score = self.sampledMinHashes[indexMatch].jaccard(query)
			print('Score: %s\n' % score)

			scores.append(score)

		return scores

	# create minHash objects for every dataset
	def createMinHash(self, dataset):
		minHashes = []
		for code in tqdm(dataset):
			minHash = MinHash(num_perm = 128)
			for d in code: # TODO modify this for n-grams
				minHash.update("".join(d).encode('utf-8'))
			minHashes.append(minHash)
		return minHashes

	def constructHistogram(self, scores):
		plt.hist(scores)
		plt.xlabel('Jaccard Similarity Score')
		plt.ylabel('Counts')
		plt.show()
Example #27
class HashJaccard(FilterProblem):
    """
  A class that does clustering based on hashes from the datasketch library.
  """
    @property
    def num_perm(self):
        return DATA_FILTERING["num_permutations"]

    @property
    def DataPointClass(self):
        return DataPoint

    # Find nearest medoid for a data point.
    def find_nearest_medoid(self, data_point, data_tag=""):
        nearest_medoid = self.forest.query(data_point.min_hash, 1)
        if not nearest_medoid:
            nearest_medoid = [
                random.randint(0, self.num_clusters[data_tag] - 1)
            ]
        return nearest_medoid[0]

    # Do the clustering of sources and targets.
    def clustering(self, data_tag):
        """
    Params:
      :data_tag: Whether it's source or target data.
    """

        # Create a min hash forest to quickly find nearest neighbours.
        self.forest = MinHashLSHForest(num_perm=self.num_perm)

        # Initialize clusters.
        medoids = random.sample(range(len(self.data_points[data_tag])),
                                self.num_clusters[data_tag])

        for i in range(self.num_clusters[data_tag]):
            cl = self.ClusterClass(self.data_points[data_tag][medoids[i]])
            self.clusters[data_tag].append(cl)

            # Put medoids into the forest.
            self.forest.add(i, self.clusters[data_tag][-1].medoid.min_hash)
        self.forest.index()

        # For each data_point find a cluster.
        self.cluster_points(data_tag)

        # These will be needed for the stopping criterion.
        cluster_names = [
            self.clusters[data_tag][i].medoid.string
            for i in range(self.num_clusters[data_tag])
        ]
        cluster_names_old = list(cluster_names)
        count = 0
        counts = []
        exit = False

        # Clustering loop.
        while not exit:
            count += 1

            # Find the point that minimizes the mean distance within a cluster.
            self.find_medoid(data_tag)

            # Create new forest.
            self.forest = MinHashLSHForest(num_perm=self.num_perm)
            for i in range(self.num_clusters[data_tag]):
                self.forest.add(i, self.clusters[data_tag][i].medoid.min_hash)
            self.forest.index()

            # Assign each point to the new medoids.
            self.cluster_points(data_tag)

            # Check stopping criteria.
            exit, cluster_names, cluster_names_old, counts = self.stop_clustering(
                data_tag, cluster_names, cluster_names_old, count, counts)
Example #28
    def saver(self, i, q, retq, matq, l):
        print_start = t.time()
        save_start = t.time()
        global_time = t.time()
        chunk_size = 100
        count = 0
        forest = MinHashLSHForest(num_perm=self.numperm)

        taxstr = ''
        if self.tax_filter is None:
            taxstr = 'NoFilter'
        if self.tax_mask is None:
            taxstr += 'NoMask'
        else:
            taxstr = str(self.tax_filter)
        dataset_name = self.saving_name + '_' + taxstr
        self.errorfile = self.saving_path + 'errors.txt'
        with open(self.errorfile, 'w') as hashes_error_files:
            with h5py.File(self.hashes_path, 'w', libver='latest') as h5hashes:
                datasets = {}
                if dataset_name not in h5hashes.keys():
                    if self.verbose == True:
                        print('creating dataset')
                        print(dataset_name)
                        print('filtered at taxonomic level: ' + taxstr)
                    h5hashes.create_dataset(dataset_name + '_' + taxstr,
                                            (chunk_size, 0),
                                            maxshape=(None, None),
                                            dtype='int32')
                    datasets[dataset_name] = h5hashes[dataset_name + '_' +
                                                      taxstr]
                    if self.verbose == True:
                        print(datasets)
                    h5flush = h5hashes.flush
                print('saver init ' + str(i))
                while True:
                    this_dataframe = retq.get()
                    if this_dataframe is not None:
                        if not this_dataframe.empty:
                            hashes = this_dataframe['hash'].to_dict()
                            print(str(this_dataframe.Fam.max()) + 'fam num')
                            print(str(count) + ' done')
                            hashes = {
                                fam: hashes[fam]
                                for fam in hashes if hashes[fam]
                            }
                            [
                                forest.add(str(fam), hashes[fam])
                                for fam in hashes
                            ]
                            for fam in hashes:
                                if len(datasets[dataset_name]) < fam + 10:
                                    datasets[dataset_name].resize(
                                        (fam + chunk_size,
                                         len(hashes[fam].hashvalues.ravel())))
                                datasets[dataset_name][
                                    fam, :] = hashes[fam].hashvalues.ravel()
                                count += 1
                            if t.time() - save_start > 200:
                                print(t.time() - global_time)
                                forest.index()
                                print(forest.query(hashes[fam], k=10))
                                h5flush()
                                save_start = t.time()
                                with open(self.lshforestpath,
                                          'wb') as forestout:
                                    forestout.write(pickle.dumps(forest, -1))
                                if self.verbose == True:
                                    print('save done at' +
                                          str(t.time() - global_time))
                        else:
                            print(this_dataframe)
                    else:
                        if self.verbose == True:
                            print('wrap it up')
                        with open(self.lshforestpath, 'wb') as forestout:
                            forestout.write(pickle.dumps(forest, -1))
                        h5flush()
                        if self.verbose == True:
                            print('DONE SAVER' + str(i))
                        break
Example #29
class AutoTag():

    def __init__(self, num_permutation=60):
        self.__num_permutation = num_permutation
        self.__forest = MinHashLSHForest(self.__num_permutation)
        self.__lem = WordNetLemmatizer()
        stop_words = set(stopwords.words("english"))
        stop_words.add('—')
        stop_words.add('And')
        self.__stop_words = stop_words

    def fit(self, csv):
        df = pd.read_csv(csv)
        df.drop_duplicates(subset='webURL', keep=False, inplace=True)
        df.dropna(inplace=True)
        for index, row in df.iterrows():
            min_hash = self.make_min_hash(self.make_clean_words_list(row['Text']))
            self.__forest.add(row['webURL'], min_hash)
            if index % 100 == 0:
                print(index, end='\r', flush=True)
        self.__forest.index()


    def make_clean_words_list(self, text):
        text = re.sub('[^a-zA-Z]', ' ', text)

        #Convert to lowercase
        text = text.lower()

        # remove tags
        text = re.sub("</?.*?>", " <> ", text)

        # remove special characters and digits
        text=re.sub("(\\d|\\W)+"," ",text)

        #Lemmatisation
        text = text.split()
        lem = WordNetLemmatizer()
        text = [lem.lemmatize(word) for word in text if not word in self.__stop_words]

        return text


    def predict(self, text, num_of_niebhors):
        #TODO : change results into tags
        query = self.make_min_hash(self.make_clean_words_list(text))
        return self.__forest.query(query, num_of_niebhors)




    def make_min_hash(self,words):
        min_hash = MinHash(self.__num_permutation)
        for word in words:
            min_hash.update(word.encode('utf8'))
        return min_hash


    def load_trained_model(self, trained_model_file_name, num_of_permutations):
        self.__forest = pickle.load(open(trained_model_file_name, 'rb'))
        self.__num_permutation = num_of_permutations

    def save_model(self, file_name):
        pickle.dump(self.__forest, open(file_name, 'wb'))
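A possible end-to-end use of the AutoTag class above; the CSV path and file names are placeholders, and the 'webURL'/'Text' column names are taken from fit() rather than from any documented interface:

if __name__ == '__main__':
    tagger = AutoTag(num_permutation=60)
    tagger.fit('articles.csv')                     # expects 'webURL' and 'Text' columns
    tagger.save_model('autotag_forest.pkl')

    # Later: reload the trained forest and look up similar articles.
    tagger = AutoTag(num_permutation=60)
    tagger.load_trained_model('autotag_forest.pkl', num_of_permutations=60)
    print(tagger.predict('Some article text to tag ...', 5))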
Example #30
class LshNN(ProgramNN):
	CACHE_DIR = 'cache/'

	def __init__(self, sampledDataPath, num_perm=128, top_k=1, evict_cache=False):
		"""
		An agent class to find rubric sampled nearest neighbour of a given
		program by using a MinHash LSH forest.

		"""
		self.sampledDataPath = sampledDataPath
		self.num_perm = num_perm
		self.top_k = top_k
		self.evict_cache = evict_cache
		self.rawProgramData, self.sampledData = self.loadSyntheticData()
		self.create_lsh_forest()


	def create_lsh_forest(self):
		cache_file = os.path.join(self.CACHE_DIR, 'lsh_forest.pkl')
		if not self.evict_cache and os.path.isfile(cache_file):
			# load precomputed
			print('Loading cached forest')
			self.forest = load_pickle(cache_file)
		else:
			sampledSets = self.processData(self.sampledData)
			self.sampledMinHashes = self.createMinHashSet(sampledSets)

			self.forest = MinHashLSHForest(num_perm=self.num_perm)
			for prog_idx, minHash in enumerate(self.sampledMinHashes):
				self.forest.add(prog_idx, minHash)

			self.forest.index()

			os.makedirs(self.CACHE_DIR, exist_ok=True)
			save_pickle(self.forest, cache_file)

	def minHash(self, code_tokens):
		minHash = MinHash(num_perm=self.num_perm)
		for d in code_tokens: # TODO modify this for n-grams
			minHash.update("".join(d).encode('utf-8'))

		return minHash

	# create minHash objects for every dataset
	def createMinHashSet(self, dataset):
		minHashes = []
		for code in tqdm(dataset):
			minHashes.append(self.minHash(code))
		return minHashes

	def multi_dict_get(self, key, all_dicts):
		for dic in all_dicts:
			if key in dic:
				return dic[key]
		raise ValueError('Key not in any of the dictionaries')

	def loadSyntheticData(self):
		cache_file = os.path.join(self.CACHE_DIR, 'lsh_programs.pkl')
		if not self.evict_cache and os.path.isfile(cache_file):
			data = load_json(cache_file)
			prog_items = data['raw_programs']
			anon_progs = data['anon_programs']
		else:
			standard_path = self.sampledDataPath + '/standard/train' + SYNTH_NAME
			uniform_path = self.sampledDataPath + '/uniform/train' + SYNTH_NAME
			tempered_path = self.sampledDataPath + '/tempered/train' + SYNTH_NAME
			standardDict = pickle.load(open(standard_path, "rb" ))
			uniformDict = pickle.load(open(uniform_path, "rb" ))
			temperedDict =  pickle.load(open(tempered_path, "rb" ))

			all_dicts = [standardDict, uniformDict, temperedDict]

			# this step is not stable across different runs if caching forest
			# so this needs to be cached too
			prog_items = list(standardDict.keys() | uniformDict.keys() | temperedDict.keys())
			anon_progs = [self.multi_dict_get(prog, all_dicts) for prog in prog_items]
			data = dict(raw_programs=prog_items, anon_programs=anon_progs)

			os.makedirs(self.CACHE_DIR, exist_ok=True)
			save_json(data, cache_file)

			# if we don't load the cache here, we should regenerate the forest too
			self.evict_cache = True

		return prog_items, anon_progs



	def transformCode(self, program):
		splitCode = program.split()
		return splitCode
		#return ngrams(splitCode, 3)

	# tokenize every sentence and return a list of sentences
	def processData(self, dataset):
		processed = []
		for datum in dataset:
			transformedCode = self.transformCode(datum)
			processed.append(transformedCode)
		return processed

	def findNearestNeighbours(self, studentProgram, **kwargs):
		minHash = self.minHash(self.transformCode(studentProgram))
		result = self.forest.query(minHash, self.top_k)
		top_k_programs_anon = [self.sampledData[idx] for idx in result]
		top_k_programs = [self.rawProgramData[idx] for idx in result]
		#return top_k_programs, top_k_programs_anon
		return top_k_programs
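A possible usage of LshNN, with placeholder paths (the constructor expects the standard/uniform/tempered pickles referenced in loadSyntheticData to exist under sampledDataPath):

nn = LshNN('data/raw/liftoff', num_perm=128, top_k=3)
student_program = 'def run():\n    move()\n    turn_left()'
for neighbour in nn.findNearestNeighbours(student_program):
    print(neighbour)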
Example #31
from datasketch import MinHashLSHForest, MinHash

g = []

listlsh = []
lsh = MinHashLSHForest(num_perm=128)
for artist, sets in artist_shingle.items():
    a = MinHash(num_perm=128)
    for d in sets[0]:
        a.update(d.encode('utf8'))
    listlsh.append(a)
    lsh.add(artist, a)

lsh.index()
tester = {}
with open('tester.json') as file:
    tester = json.loads(file.read().encode('latin-1'))
numcorrect_1 = 0
numcorrect_5 = 0
numcorrect_10 = 0
total = 0
for artist, songlist in tester.items():
    for song in songlist:
        m1 = MinHash(num_perm=128)
        songp = clean_text(song['lyrics'])
        for d in songp:
            m1.update(d.encode('utf8'))
        result = lsh.query(m1, 10)
        if len(result):