示例#1
0
def _print_and_min_hash_w_shingles(documents):
    # Helper for the similarity test below: runs `documents` through
    # shingle_generator as W_SHINGLES, prints every shingle list together with
    # the min-hash values computed for it by run(), and returns the min-hash
    # values of the LAST item yielded (None when nothing was yielded).
    min_hash_values = None
    for shingle, _ in shingle_generator(documents, type=ShingleType.W_SHINGLES):
        # Single-argument print(...) calls produce the same output under
        # Python 2 and Python 3.
        print(shingle)
        min_hash_values = run(shingle)
        print("number of min_hash values -> %s" % str(len(min_hash_values)))
        print(min_hash_values)
        print("")
    return min_hash_values


def test_similarity_of_two_sets_using_w_shingles():
    # Min-hashes two faux word corpora via w-shingles and checks that the
    # Jaccard similarity of the two min-hash sets is at least 0.44.

    print(".....Testing w-shingles (shingling, minhash & calc jaccard similarity)\n")

    min_values_list_w_shingles = _print_and_min_hash_w_shingles(
        faux_generator_string_words())
    min_values_list_w_shingles_2 = _print_and_min_hash_w_shingles(
        faux_generator_string_words_2())

    # Fail with a readable message instead of the TypeError that set(None)
    # would raise if either generator produced no shingles.
    assert min_values_list_w_shingles is not None, \
        "first document generator produced no shingles"
    assert min_values_list_w_shingles_2 is not None, \
        "second document generator produced no shingles"

    # calculate jaccard similarity - should be approx 44% similar
    similarity_ratio = jaccard_similarity(set(min_values_list_w_shingles),
                                          set(min_values_list_w_shingles_2))
    print("Asserting jaccard similarity should be ~44%\n")

    assert similarity_ratio >= .44
def test_shingle_generator_k_shingles_yield_list_of_strings(mock_k_shingles_gen):
    # Checks that shingle_generator, when asked for K_SHINGLES, calls the
    # mocked k-shingles generator with the source generator and the size, and
    # that the first item it yields is what the mock produced.
    # `mock_k_shingles_gen` is injected from outside this function
    # (presumably by a patch decorator -- not visible here).

    # set up
    shingle_type = ShingleType.K_SHINGLES  # avoid shadowing the builtin `type`
    size = 4

    faux_results = get_faux_list_of_k_shingles()
    faux_string_generator = generator_string()

    # The mock has to return something iterable that produces faux_results.
    # The previous `return_value = yield faux_results` put a `yield` in the
    # test body, which turned the whole test into a generator function: calling
    # it executed none of the code below, and on resumption the mock's
    # return_value would have been set to None.
    # NOTE(review): assumes shingle_generator yields the items of the mocked
    # generator unchanged -- confirm against its implementation.
    mock_k_shingles_gen.return_value = iter([faux_results])

    # execute
    actual_results = next(
        shgl.shingle_generator(faux_string_generator, size=size, type=shingle_type))

    # asserts
    mock_k_shingles_gen.assert_called_once_with(faux_string_generator, size)
    nt.eq_(actual_results, faux_results)
示例#3
0
def test_shingle_generator_k_shingles_yield_list_of_strings(
        mock_k_shingles_gen):
    # Verifies the K_SHINGLES path of shingle_generator: the mocked k-shingles
    # generator must be called once with (source generator, size) and its
    # output must be the first item yielded back to the caller.
    # `mock_k_shingles_gen` is supplied from outside this function
    # (presumably by a patch decorator -- not visible here).

    # set up
    shingle_type = ShingleType.K_SHINGLES  # avoid shadowing the builtin `type`
    size = 4

    faux_results = get_faux_list_of_k_shingles()
    faux_string_generator = generator_string()

    # Give the mock a real iterator to return. Using a `yield` expression on
    # this line made the test function itself a generator, so none of its body
    # ran when the test was called and the asserts were never evaluated.
    # NOTE(review): assumes shingle_generator passes through the items of the
    # mocked generator unchanged -- confirm against its implementation.
    mock_k_shingles_gen.return_value = iter([faux_results])

    # execute
    actual_results = next(
        shgl.shingle_generator(
            faux_string_generator, size=size, type=shingle_type))

    # asserts
    mock_k_shingles_gen.assert_called_once_with(faux_string_generator, size)
    nt.eq_(actual_results, faux_results)
    def post(self):
        """Run Jaccard LSH over the documents from ``self.get_next()`` and log matches.

        Set-up opens the node, resets ``self.tweets`` and builds a fresh
        ``LshJaccard`` (20 bands, 10 rows per band). If set-up fails, the
        ``tw_auth`` session entry is cleared and the client is redirected to
        ``/``.

        Each ``(shingles_list, original_document)`` pair from
        ``shingle_generator`` is min-hashed, wrapped in a ``JaccardDocument``
        and passed to ``self.lshj.run``; truthy results are logged. Reading
        continues until ``NotFound`` is raised, after which the node is closed
        with ``save=True``.
        """
        try:
            self.open()
            self.tweets = []
            self.lshj = LshJaccard(num_bands=20, rows_per_band=10)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # still propagate, and the failure reason is recorded instead of
            # being silently dropped.
            logging.exception('post: set-up failed, clearing tw_auth and redirecting to /')
            self.session['tw_auth'] = None
            self.redirect('/')
            return

        while True:
            try:
                for shingles_list, original_document in shingle_generator(self.get_next()):
                    # get minhash signatures for each shingle list
                    min_hash_signatures = minhash.run(shingles_list)

                    # create document and run LSH for Jaccard Distance
                    doc_obj = JaccardDocument(original_document, shingles_list, min_hash_signatures)

                    logging.info('Running Jaccard LSH Current Tweet: %s', original_document)

                    results = self.lshj.run(doc_obj)
                    if not results:
                        # nothing to report for this document
                        continue

                    logging.info('.....RESULTS.....')
                    logging.info('.....score: %s', str(results['score']))
                    logging.info('.....match_found: %s', str(results['match_found']))
                    logging.info(results['document_1'])
                    logging.info(results['document_2'])
                    logging.info('---------------------------------------------------')
                    logging.info('Results: %s', str(results['score']))

                    #TODO update the code the read this and prints out score, docs and match boolean flag
                    #self.tweets.append(str(results['score']))
            except NotFound as nf:
                # NotFound is the only exit from the read loop.
                logging.error('TwitterReadNode.GetNext completed, %s', nf.value)
                break

        self.close(save=True)
示例#5
0
def test_shingle_generator_invalid_shingle_type_raise_value_error():
    # Requests the first item from shingle_generator with an unknown shingle
    # type ("blah"). Nothing is asserted here; per the test name a ValueError
    # is expected, and the check for it presumably lives in a decorator
    # outside this view.

    # set up
    invalid_type_shingles = shgl.shingle_generator(generator_words(), type="blah")

    # execute
    next(invalid_type_shingles)
def test_shingle_generator_invalid_shingle_type_raise_value_error():
    # Builds a shingle generator over generator_words() with the unsupported
    # type "blah" and advances it once. The function itself asserts nothing;
    # the ValueError named in the test title is presumably checked by a
    # decorator that is not visible here.

    # set up
    words = generator_words()
    shingles_for_bad_type = shgl.shingle_generator(words, type="blah")

    # execute
    next(shingles_for_bad_type)