예제 #1
0
def copy_db(biz_id, cid):
    '''
    Build a ReviewDB through ReviewDB.load for the requested scope. In contrast
    to select_db, the 'all' case is loaded on the fly here as well.
    Args:
        biz_id: 'all' for the whole dataset, otherwise an entity id
        cid: accepted but not used by this function
    Returns:
        the object produced by ReviewDB.load for the matching files
    '''
    if biz_id != 'all':
        # Single entity: load that entity's attribute and centroid files.
        return ReviewDB.load(hotel_attr_path(biz_id), hotel_centroids_path(biz_id))
    # Whole dataset: load from the module-level cluster/centroid paths.
    return ReviewDB.load(all_clusters_path, all_centroids_path)
예제 #2
0
    def test_nlplength_funcs(self):
        '''
        Check the three NLPLengths length counters (word tokens, sentence
        tokens, characters) against known results from the testing db, for an
        empty query, for id 0 and for the cluster '1-2-1-0-0'.
        '''
        lengths = NLPLengths(ReviewDB.load(cluster_file='tests/testing_db.csv'))
        cluster_id = '1-2-1-0-0'

        # An empty query yields an empty Counter and all-zero statistics.
        expected_empty = (Counter(), 0, 0, 0, 0)
        self.assertEqual(lengths.word_token_review_length_counter([]), expected_empty)

        # word_token_review_length_counter
        expected_words_zero = (Counter({"12": 1}), 12.0, 12, (12, 1), 0.0)
        words_zero = lengths.word_token_review_length_counter(0)
        print(words_zero)
        self.assertEqual(words_zero, expected_words_zero)

        expected_words_cluster = (
            Counter({"22": 1, "7": 1, "6": 1}),
            11.666666666666666,
            7,
            (6, 1),
            8.962886439832502,
        )
        words_cluster = lengths.word_token_review_length_counter(cluster_id)
        print(words_cluster)
        self.assertEqual(words_cluster, expected_words_cluster)

        # sent_token_review_length_counter
        expected_sents_zero = (Counter({"1": 1}), 1.0, 1, (1, 1), 0.0)
        sents_zero = lengths.sent_token_review_length_counter(0)
        print(sents_zero)
        self.assertEqual(sents_zero, expected_sents_zero)

        expected_sents_cluster = (
            Counter({"1": 2, '3': 1}),
            1.6666666666666667,
            1,
            (1, 2),
            1.1547005383792515,
        )
        sents_cluster = lengths.sent_token_review_length_counter(cluster_id)
        print(sents_cluster)
        self.assertEqual(sents_cluster, expected_sents_cluster)

        # char_review_length_counter
        expected_chars_zero = (Counter({"53": 1}), 53.0, 53, (53, 1), 0.0)
        chars_zero = lengths.char_review_length_counter(0)
        print(chars_zero)
        self.assertEqual(chars_zero, expected_chars_zero)

        expected_chars_cluster = (
            Counter({"101": 1, "31": 1, "30": 1}),
            54.0,
            31,
            (30, 1),
            40.70626487409524,
        )
        chars_cluster = lengths.char_review_length_counter(cluster_id)
        print(chars_cluster)
        self.assertEqual(chars_cluster, expected_chars_cluster)

        # Looking up a length that never occurred must give 0 (Counter semantics).
        sentence_histogram = sents_cluster[0]
        self.assertEqual(sentence_histogram['0'], 0)
예제 #3
0
def cluster_topwords():
    '''
    Serve the top TF-IDF terms for one cluster, or for two clusters side by
    side, as a JSON response.
    Query parameters (read from request.args):
        biz_id: 'all' to use the module-level models; any other value is
            treated as an entity id whose data is loaded and modelled on the fly
        cid1: id of the first cluster
        cid2: optional id of a second cluster
        ngramsize: n-gram size as an integer; 2 selects the module-level
            bigram model. A missing value is treated as 1.
    Returns:
        a Response with status 200 and mimetype application/json whose body is
        a list holding one top_k result per requested cluster, with an
        Access-Control-Allow-Origin: * header added
    Raises:
        ValueError: if ngramsize is present but is not an integer literal
    '''
    # detail['topwords'] = tfidf_model.top_k(cid, 5)
    args = request.args
    biz_id = args.get('biz_id')
    cid1 = args.get('cid1')
    cid2 = args.get('cid2')

    # int(None) raises TypeError, so fall back to 1 when the parameter is
    # absent (presumably the unigram case served by tfidf_model -- confirm).
    raw_ngramsize = args.get('ngramsize')
    ngramsize = 1 if raw_ngramsize is None else int(raw_ngramsize)

    # The fixed-set comparison is currently switched off; it used to be driven
    # by a 'fixed' query parameter: bool(int(request.args.get('fixed'))).
    fixed = False

    if biz_id != 'all':
        # Per-entity request: build a model from that entity's own data.
        db_biz = ReviewDB.load(cluster_file=hotel_attr_path(biz_id))
        cur_tfidf_model = TfidfModel.TFIDFModel(db_biz, ngramsize)
    elif ngramsize == 2:
        cur_tfidf_model = tfidf_model_2g
    else:
        cur_tfidf_model = tfidf_model

    # [Xiong] Revise this part to make it more efficient!
    # Right now it's just a simple and dirty hack.
    if cid2 is None:
        topwords = [cur_tfidf_model.top_k(cid1)]
    elif fixed:
        topwords = cur_tfidf_model.compare_fixed_set(cid1, cid2)
    else:
        topwords = [cur_tfidf_model.top_k(cid1), cur_tfidf_model.top_k(cid2)]

    res = Response(json.dumps(topwords), status=200, mimetype='application/json')
    res.headers.add('Access-Control-Allow-Origin', '*')
    return res
예제 #4
0
 def test_density_estimator(self):
     '''
     The density estimate built from a character-length histogram must sum
     to 1.
     '''
     db = ReviewDB.load(cluster_file='tests/testing_db.csv')
     nlp = NLPLengths(db)
     histogram_comparison = HistogramComparison()
     char_result_cluster = nlp.char_review_length_counter('1-2-1-0-0')
     # The first element of the counter result is the histogram itself.
     histogram = char_result_cluster[0]
     density_estimate = histogram_comparison.density_estimator(histogram)
     # A sum of floats may differ from 1.0 by rounding error, so compare
     # approximately rather than exactly.
     self.assertAlmostEqual(sum(density_estimate.values()), 1.0)
예제 #5
0
def select_reviews_by_layer(biz_id, cid):
    '''
    Fetch the reviews of one cluster, scoped either to a single entity or to
    the full dataset.
    Args:
        biz_id: 'all', or the id of an entity (e.g. hotel)
        cid: id of the cluster whose reviews are wanted
    Returns:
        the result of get_review_from_id(cid) on the selected db (described by
        the original author as sub dataframes for the specified cluster)
    '''
    if biz_id == 'all':
        source_db = db_all
    else:
        # Entity-specific data is loaded on demand.
        source_db = ReviewDB.load(hotel_attr_path(biz_id), hotel_centroids_path(biz_id))
    return source_db.get_review_from_id(cid)
예제 #6
0
def select_db(biz_id):
    '''
    Pick the review db object that matches biz_id. The whole-dataset db is
    served from the module-level db_all so that it is not reloaded on every
    call (which, per the original notes, can take a lot of time); a single
    entity is loaded on demand, which is fairly fast.
    Args:
        biz_id: 'all' or an actual entity id
    Returns:
        db_all for 'all', otherwise the db loaded for that specific entity
    '''
    if biz_id != 'all':
        return ReviewDB.load(hotel_attr_path(biz_id), hotel_centroids_path(biz_id))
    return db_all
예제 #7
0
 def test_TFIDF_funcs(self):
     '''
     Exercise TFIDFModel.tfidf_score, top_k and compare_top_k on the testing
     db, for id 0 and for the cluster '1-2-1-0-0'.
     '''
     db = ReviewDB.load(cluster_file='tests/testing_db.csv')
     tfidf = TFIDFModel(db)

     # tfidf_score (per the original notes this also calls scores_to_counter)
     tfidf_zero = tfidf.tfidf_score(0, ['wharf'])
     self.assertIn('wharf', tfidf_zero.keys())
     self.assertNotIn('banana', tfidf_zero.keys())
     tfidf_cluster = tfidf.tfidf_score('1-2-1-0-0', ['towels', 'unwelcome', 'charge', 'wharf'])
     self.assertGreater(tfidf_cluster['towels'], 0)
     self.assertGreater(tfidf_cluster['unwelcome'], 0)
     self.assertGreater(tfidf_cluster['charge'], 0)
     self.assertEqual(tfidf_cluster['wharf'], 0.0)

     # top_k (per the original notes this also calls scores_to_counter)
     top_for_zero = tfidf.top_k(0)
     self.assertIn('wharf', top_for_zero.keys())
     top_for_cluster = tfidf.top_k('1-2-1-0-0')
     self.assertIn('towels', top_for_cluster.keys())
     self.assertIn('charge', top_for_cluster.keys())
     self.assertNotIn('wharf', top_for_cluster.keys())

     # compare_top_k: every top term of either side must appear in both
     # groups, and each group must carry its own side's top_k score.
     # Asserting directly (instead of collecting boolean flags) makes a
     # failure report the offending key and values.
     group1, group2 = tfidf.compare_top_k(0, '1-2-1-0-0')
     for key in top_for_cluster.keys():
         self.assertIn(key, group1.keys())
         self.assertIn(key, group2.keys())
         self.assertEqual(group2[key], top_for_cluster[key])
     for key in top_for_zero.keys():
         self.assertIn(key, group2.keys())
         self.assertIn(key, group1.keys())
         self.assertEqual(group1[key], top_for_zero[key])
     self.assertEqual(group2['wharf'], 0.0)
예제 #8
0
 def test_sorensen(self):
     '''
     Check HistogramComparison.sorensen on a histogram compared with itself,
     on a trivial pair, and on a pair of partially overlapping histograms.
     '''
     db = ReviewDB.load(cluster_file='tests/testing_db.csv')
     nlp = NLPLengths(db)
     histogram_comparison = HistogramComparison()
     histogram1 = nlp.char_review_length_counter('1-2-1-0-0')[0]
     # A histogram compared with itself has zero distance.
     compare_self = histogram_comparison.sorensen(histogram1, histogram1)
     self.assertEqual(compare_self, 0.0)
     trivial_histogram1 = Counter({"1": 1})
     trivial_histogram2 = Counter({"1": 0})
     compare_trivial = histogram_comparison.sorensen(
         trivial_histogram1, trivial_histogram2)
     self.assertEqual(compare_trivial, 1.0)
     more_complicated_histogram1 = Counter({"1": 1, "2": 2})
     more_complicated_histogram2 = Counter({"2": 3, "3": 4})
     compare_more_complicated = histogram_comparison.sorensen(
         more_complicated_histogram1, more_complicated_histogram2)
     # Two-sided tolerance: the previous assertLess(value - expected, .001)
     # also accepted any value far below the expected one.
     # NOTE(review): confirm 0.66667 is the intended expected value now that
     # the check is strict in both directions.
     self.assertAlmostEqual(compare_more_complicated, 0.66667, delta=.001)
예제 #9
0
 def test_hellinger(self):
     '''
     Check HistogramComparison.hellinger on a histogram compared with itself,
     on a trivial pair, and on a pair of partially overlapping histograms.
     '''
     db = ReviewDB.load(cluster_file='tests/testing_db.csv')
     nlp = NLPLengths(db)
     histogram_comparison = HistogramComparison()
     histogram1 = nlp.char_review_length_counter('1-2-1-0-0')[0]
     # A histogram compared with itself has zero distance.
     compare_self = histogram_comparison.hellinger(histogram1, histogram1)
     self.assertEqual(compare_self, 0.0)
     trivial_histogram1 = Counter({"1": 1})
     trivial_histogram2 = Counter({"1": 0})
     compare_trivial = histogram_comparison.hellinger(
         trivial_histogram1, trivial_histogram2)
     # Expected value is 1/sqrt(2); compare approximately so the test does
     # not depend on the last bit of a floating-point square root.
     self.assertAlmostEqual(compare_trivial, 0.7071067811865475)
     more_complicated_histogram1 = Counter({"1": 1, "2": 2})
     more_complicated_histogram2 = Counter({"2": 3, "3": 4})
     compare_more_complicated = histogram_comparison.hellinger(
         more_complicated_histogram1, more_complicated_histogram2)
     # Two-sided tolerance: the previous assertLess(value - expected, .001)
     # also accepted any value far below the expected one.
     self.assertAlmostEqual(compare_more_complicated, 0.6822591268536838, delta=.001)
예제 #10
0
 def test_nlplength_init(self):
     '''Constructing NLPLengths from the testing db must yield an object.'''
     testing_db = ReviewDB.load(cluster_file='tests/testing_db.csv')
     lengths = NLPLengths(testing_db)
     self.assertIsNotNone(lengths)
예제 #11
0
 def test_tfidf_bigram(self):
     '''A bigram model must report a score entry for a queried bigram.'''
     testing_db = ReviewDB.load(cluster_file='tests/testing_db.csv')
     bigram_model = TFIDFModel(testing_db, ngramsize=2)
     bigram = "wharf rooms"
     scores_for_zero = bigram_model.tfidf_score(0, [bigram])
     self.assertIn(bigram, scores_for_zero.keys())
예제 #12
0
 def test_TFIDF_init(self):
     '''Constructing TFIDFModel from the testing db must yield an object.'''
     testing_db = ReviewDB.load(cluster_file='tests/testing_db.csv')
     model = TFIDFModel(testing_db)
     self.assertIsNotNone(model)