Пример #1
0
def _last_min_hash_values(documents):
    """Shingle ``documents`` as w-shingles and return the last min-hash list.

    Each shingle list yielded by ``shingle_generator`` is printed, passed to
    ``run`` and the resulting min-hash values are printed as well.  Only the
    values computed for the final shingle list are returned; ``None`` is
    returned when the generator yields nothing.
    """
    min_values = None
    # The generator also yields the original document, which is not used here.
    for shingle, _ in shingle_generator(documents, type=ShingleType.W_SHINGLES):
        print(shingle)
        min_values = run(shingle)
        print("number of min_hash values -> %s" % len(min_values))
        print(min_values)
        # print("") emits a blank line on both Python 2 and Python 3.
        print("")
    return min_values


def test_similarity_of_two_sets_using_w_shingles():
    """Check the Jaccard similarity of two min-hashed w-shingle fixtures.

    Both fixtures are shingled and min-hashed, then the two sets of min-hash
    values are compared with ``jaccard_similarity``.  The test passes when the
    resulting ratio is at least 0.44.
    """
    print(".....Testing w-shingles (shingling, minhash & calc jaccard similarity)\n")

    min_values_list_w_shingles = _last_min_hash_values(
        faux_generator_string_words())
    min_values_list_w_shingles_2 = _last_min_hash_values(
        faux_generator_string_words_2())

    # Fail with a readable message instead of a TypeError from set(None) when
    # a fixture produced no shingles at all.
    assert min_values_list_w_shingles is not None, \
        "first fixture produced no shingles"
    assert min_values_list_w_shingles_2 is not None, \
        "second fixture produced no shingles"

    # calculate jaccard similarity - should be approx 44% similar
    similarity_ratio = jaccard_similarity(
        set(min_values_list_w_shingles), set(min_values_list_w_shingles_2))
    print("Asserting jaccard similarity should be ~44%\n")

    # NOTE(review): the message says "~44%", but only the lower bound is
    # enforced here - confirm whether an upper bound is wanted too.
    assert similarity_ratio >= .44
    def post(self):
        """Run Jaccard LSH over the documents supplied by ``self.get_next()``.

        After ``self.open()`` succeeds, the output of ``self.get_next()`` is
        shingled repeatedly; every shingle list is min-hashed, wrapped in a
        ``JaccardDocument`` and passed to ``self.lshj.run``.  Any truthy
        result is written to the log.  Processing stops when ``NotFound`` is
        raised, after which ``self.close(save=True)`` is called.

        If set-up raises, ``self.session['tw_auth']`` is reset to ``None``,
        the client is redirected to ``'/'`` and nothing else happens.
        """
        try:
            self.open()
            self.tweets = []
            self.lshj = LshJaccard(num_bands=20, rows_per_band=10)
        except Exception:
            # Catch Exception rather than using a bare ``except:`` so that
            # SystemExit / KeyboardInterrupt still propagate, and log the
            # cause instead of silently discarding it.
            logging.exception('post: set-up failed, clearing tw_auth and redirecting')
            self.session['tw_auth'] = None
            self.redirect('/')
            return

        while True:
            try:
                for shingles_list, original_document in shingle_generator(self.get_next()):
                    # get minhash signatures for each shingle list
                    min_hash_signatures = minhash.run(shingles_list)

                    # create document and run LSH for Jaccard Distance
                    doc_obj = JaccardDocument(original_document, shingles_list, min_hash_signatures)

                    logging.info('Running Jaccard LSH Current Tweet: %s', original_document)

                    results = self.lshj.run(doc_obj)
                    if not results:
                        continue

                    # logging formats the %s arguments itself, so no str() is needed.
                    logging.info('.....RESULTS.....')
                    logging.info('.....score: %s', results['score'])
                    logging.info('.....match_found: %s', results['match_found'])
                    logging.info(results['document_1'])
                    logging.info(results['document_2'])
                    logging.info('---------------------------------------------------')
                    logging.info('Results: %s', results['score'])

                    # TODO: update the code to read this and print out the
                    # score, docs and match boolean flag, e.g.
                    # self.tweets.append(str(results['score']))
            except NotFound as nf:
                # NotFound ends the read loop.
                logging.error('TwitterReadNode.GetNext completed, %s', nf.value)
                break

        # NOTE(review): this is skipped if anything other than NotFound escapes
        # the loop above - confirm whether a close (without save) is needed then.
        self.close(save=True)