def test_similarity_of_two_sets_using_w_shingles():
    """Shingle two sample documents, minhash them and compare the results.

    Each document is run through ``shingle_generator`` in W_SHINGLES mode and
    every shingle produced is fed to ``run``; the value returned for the last
    shingle is kept. The two kept value lists are turned into sets and
    handed to ``jaccard_similarity``, whose result must be at least 0.44.
    """

    def collect_min_hash_values(documents):
        # Walk every shingle of ``documents``, echoing each one, and keep the
        # min-hash values computed for the final shingle.
        latest_values = None
        for shingle, _original_document in shingle_generator(
                documents, type=ShingleType.W_SHINGLES):
            print(shingle)
            latest_values = run(shingle)

        print("number of min_hash values -> %s" % str(len(latest_values)))
        print(latest_values)
        # Blank separator line between sections of the report.
        print("")
        return latest_values

    print(".....Testing w-shingles (shingling, minhash & calc jaccard similarity)\n")

    first_values = collect_min_hash_values(faux_generator_string_words())
    second_values = collect_min_hash_values(faux_generator_string_words_2())

    # calculate jaccard similarity - should be approx 44% similar
    similarity_ratio = jaccard_similarity(set(first_values), set(second_values))

    print("Asserting jaccard similarity should be ~44%\n")
    assert similarity_ratio >= .44
def post(self):
    """Run incoming documents through MinHash and Jaccard LSH, logging matches.

    Set-up opens the node and creates an empty ``self.tweets`` list and an
    ``LshJaccard`` instance (20 bands, 10 rows per band). If set-up raises,
    the failure is logged, ``self.session['tw_auth']`` is cleared, the
    client is redirected to ``/`` and the method returns early.

    Otherwise the method repeatedly feeds ``self.get_next()`` to
    ``shingle_generator`` and, for every (shingles, document) pair, computes
    min-hash signatures, wraps them in a ``JaccardDocument`` and passes that
    to ``self.lshj.run``. Any truthy result is written to the log. The loop
    ends when ``NotFound`` is raised, after which the node is closed with
    ``save=True``.
    """
    try:
        self.open()
        self.tweets = []
        self.lshj = LshJaccard(num_bands=20, rows_per_band=10)
    except Exception:
        # Deliberately not a bare ``except:`` so that SystemExit and
        # KeyboardInterrupt still propagate; the traceback is logged so
        # the reason for the reset is not lost.
        logging.exception('Set-up failed; clearing tw_auth and redirecting')
        self.session['tw_auth'] = None
        self.redirect('/')
        return

    while True:
        try:
            for shingles_list, original_document in shingle_generator(self.get_next()):
                # get minhash signatures for each shingle list
                min_hash_signatures = minhash.run(shingles_list)

                # create document and run LSH for Jaccard Distance
                doc_obj = JaccardDocument(original_document, shingles_list, min_hash_signatures)
                logging.info('Running Jaccard LSH Current Tweet: %s', original_document)
                results = self.lshj.run(doc_obj)

                if results:
                    logging.info('.....RESULTS.....')
                    logging.info('.....score: %s', str(results['score']))
                    logging.info('.....match_found: %s', str(results['match_found']))
                    logging.info(results['document_1'])
                    logging.info(results['document_2'])
                    logging.info('---------------------------------------------------')
                    logging.info('Results: %s', str(results['score']))
                    # TODO update the code to read this and print out score,
                    # docs and match boolean flag
                    # self.tweets.append(str(results['score']))
        except NotFound as nf:
            # NotFound is the end-of-input signal for this loop.
            logging.error('TwitterReadNode.GetNext completed, %s', nf.value)
            break

    self.close(save=True)