def test_density_estimator(self):
    """Check that a density estimate built from a histogram sums to one.

    The histogram is the character-length counter for cluster
    '1-2-1-0-0' of the toy data set in ``tests/test_data/``.
    """
    db = ReviewDB('tests/test_data/')
    nlp = NLPLengths(db.entity_db_dict['all'])
    histogram_comparison = HistogramComparison()

    # char_review_length_counter returns a tuple; element 0 is the histogram.
    char_result_cluster = nlp.char_review_length_counter('1-2-1-0-0')
    histogram = char_result_cluster[0]

    density_estimate = histogram_comparison.density_estimator(histogram)

    # The total is accumulated from floats, so compare with a tolerance:
    # exact equality with 1.0 can fail purely because of rounding error.
    self.assertAlmostEqual(sum(density_estimate.values()), 1.0)
def test_ReviewDB_init(self):
    '''
    Test initialization of review ids equivalent to original indices
    as part of init for ReviewDB object.

    For each of the first ten review ids, the single row of
    ``clusters_df`` whose ``review_id`` column holds that id must also
    carry the same value as its index label.
    '''
    db = ReviewDB('tests/test_data/')
    clusters_df = db.entity_db_dict['all'].clusters_df

    for review_id in range(10):
        # subTest reports every mismatching id instead of stopping at the
        # first failure.
        with self.subTest(review_id=review_id):
            matching_rows = clusters_df[clusters_df['review_id'] == review_id]
            # .item() raises unless exactly one row matched, so this also
            # guards against duplicated or missing review ids.
            self.assertEqual(matching_rows.index.item(), review_id)
def test_TFIDF_funcs(self):
    """Exercise TFIDFModel.tfidf_score, top_k and compare_top_k on toy data.

    Review 0 and cluster '1-2-1-0-0' are scored separately and then
    compared. Every term in either top-k list must appear in both groups
    returned by compare_top_k, with the same score as in its own top-k
    result.
    """
    db = ReviewDB('tests/test_data/')
    tfidf = TFIDFModel(db.entity_db_dict['all'])

    # test tfidf.tfidf_score(), which also calls tfidf.scores_to_counter()
    tfidf_zero = tfidf.tfidf_score(0, ['wharf'])
    self.assertIn('wharf', tfidf_zero.keys())
    self.assertNotIn('banana', tfidf_zero.keys())

    tfidf_cluster = tfidf.tfidf_score(
        '1-2-1-0-0', ['towels', 'unwelcome', 'charge', 'wharf'])
    self.assertGreater(tfidf_cluster['towels'], 0)
    self.assertGreater(tfidf_cluster['unwelcome'], 0)
    self.assertGreater(tfidf_cluster['charge'], 0)
    self.assertEqual(tfidf_cluster['wharf'], 0.0)

    # test tfidf.top_k(), which also calls tfidf.scores_to_counter()
    top_for_zero = tfidf.top_k(0)
    self.assertIn('wharf', top_for_zero.keys())

    top_for_cluster = tfidf.top_k('1-2-1-0-0')
    self.assertIn('towels', top_for_cluster.keys())
    self.assertIn('charge', top_for_cluster.keys())
    self.assertNotIn('wharf', top_for_cluster.keys())

    # test tfidf.compare_top_k()
    group1, group2 = tfidf.compare_top_k(0, '1-2-1-0-0')

    # Assert directly inside the loops so a failure names the offending key
    # and both values, instead of a bare "False is not true".
    for key in top_for_cluster.keys():
        self.assertIn(key, group1.keys())
        self.assertIn(key, group2.keys())
        self.assertEqual(
            group2[key], top_for_cluster[key],
            msg=f'group2 score for {key!r} differs from top_k of the cluster')

    for key in top_for_zero.keys():
        self.assertIn(key, group2.keys())
        self.assertIn(key, group1.keys())
        self.assertEqual(
            group1[key], top_for_zero[key],
            msg=f'group1 score for {key!r} differs from top_k of review 0')

    self.assertEqual(group2['wharf'], 0.0)
def test_sorensen(self):
    """Check HistogramComparison.sorensen on identical, trivial and mixed input.

    * a histogram compared with itself must give 0.0;
    * ``{"1": 1}`` versus ``{"1": 0}`` must give 1.0;
    * two partially overlapping histograms must give roughly 0.66667.
    """
    db = ReviewDB('tests/test_data/')
    nlp = NLPLengths(db.entity_db_dict['all'])
    histogram_comparison = HistogramComparison()

    # Element 0 of the returned tuple is the length histogram.
    histogram1 = nlp.char_review_length_counter('1-2-1-0-0')[0]
    compare_self = histogram_comparison.sorensen(histogram1, histogram1)
    self.assertEqual(compare_self, 0.0)

    trivial_histogram1 = Counter({"1": 1})
    trivial_histogram2 = Counter({"1": 0})
    compare_trivial = histogram_comparison.sorensen(
        trivial_histogram1, trivial_histogram2)
    self.assertEqual(compare_trivial, 1.0)

    more_complicated_histogram1 = Counter({"1": 1, "2": 2})
    more_complicated_histogram2 = Counter({"2": 3, "3": 4})
    compare_more_complicated = histogram_comparison.sorensen(
        more_complicated_histogram1, more_complicated_histogram2)
    # Two-sided tolerance check. The previous form,
    # assertLess(result - 0.66667, .001), only bounded the result from
    # above, so any value smaller than expected (even 0.0) passed.
    # NOTE(review): if this assertion now fails, re-derive the expected
    # value 0.66667 against the sorensen implementation.
    self.assertAlmostEqual(compare_more_complicated, 0.66667, delta=.001)
def test_hellinger(self):
    """Check HistogramComparison.hellinger on identical, trivial and mixed input.

    * a histogram compared with itself must give 0.0;
    * ``{"1": 1}`` versus ``{"1": 0}`` must give about 0.7071 (1/sqrt(2));
    * two partially overlapping histograms must give about 0.68226.
    """
    db = ReviewDB('tests/test_data/')
    nlp = NLPLengths(db.entity_db_dict['all'])
    histogram_comparison = HistogramComparison()

    # Element 0 of the returned tuple is the length histogram.
    histogram1 = nlp.char_review_length_counter('1-2-1-0-0')[0]
    compare_self = histogram_comparison.hellinger(histogram1, histogram1)
    self.assertEqual(compare_self, 0.0)

    trivial_histogram1 = Counter({"1": 1})
    trivial_histogram2 = Counter({"1": 0})
    compare_trivial = histogram_comparison.hellinger(
        trivial_histogram1, trivial_histogram2)
    # The expected value is irrational, so compare with a tolerance rather
    # than relying on bit-exact float equality.
    self.assertAlmostEqual(compare_trivial, 0.7071067811865475)

    more_complicated_histogram1 = Counter({"1": 1, "2": 2})
    more_complicated_histogram2 = Counter({"2": 3, "3": 4})
    compare_more_complicated = histogram_comparison.hellinger(
        more_complicated_histogram1, more_complicated_histogram2)
    # Two-sided tolerance check. The previous form,
    # assertLess(result - expected, .001), only bounded the result from
    # above, so any value smaller than expected passed.
    self.assertAlmostEqual(
        compare_more_complicated, 0.6822591268536838, delta=.001)
def test_nlplength_funcs(self):
    """Check the three NLPLengths length counters against the toy data set.

    Each counter returns a 5-tuple whose first element is a Counter keyed
    by length (as a string). The remaining four entries are summary values
    (presumably mean, median, mode and standard deviation -- confirm
    against NLPLengths).
    """
    database = ReviewDB('tests/test_data/')
    lengths = NLPLengths(database.entity_db_dict['all'])
    cluster_id = '1-2-1-0-0'

    # An empty selection yields an empty histogram and zeroed summaries.
    nothing = lengths.word_token_review_length_counter([])
    self.assertEqual(nothing, (Counter(), 0, 0, 0, 0))

    # Word-token lengths: single review, then a whole cluster.
    words_single = lengths.word_token_review_length_counter(0)
    print(words_single)
    expected_words_single = (Counter({"12": 1}), 12.0, 12, (12, 1), 0.0)
    self.assertEqual(words_single, expected_words_single)

    words_cluster = lengths.word_token_review_length_counter(cluster_id)
    print(words_cluster)
    expected_words_cluster = (
        Counter({"22": 1, "7": 1, "6": 1}),
        11.666666666666666,
        7,
        (6, 1),
        8.962886439832502,
    )
    self.assertEqual(words_cluster, expected_words_cluster)

    # Sentence-token lengths.
    sents_single = lengths.sent_token_review_length_counter(0)
    print(sents_single)
    expected_sents_single = (Counter({"1": 1}), 1.0, 1, (1, 1), 0.0)
    self.assertEqual(sents_single, expected_sents_single)

    sents_cluster = lengths.sent_token_review_length_counter(cluster_id)
    print(sents_cluster)
    expected_sents_cluster = (
        Counter({"1": 2, '3': 1}),
        1.6666666666666667,
        1,
        (1, 2),
        1.1547005383792515,
    )
    self.assertEqual(sents_cluster, expected_sents_cluster)

    # Character lengths.
    chars_single = lengths.char_review_length_counter(0)
    print(chars_single)
    expected_chars_single = (Counter({"53": 1}), 53.0, 53, (53, 1), 0.0)
    self.assertEqual(chars_single, expected_chars_single)

    chars_cluster = lengths.char_review_length_counter(cluster_id)
    print(chars_cluster)
    expected_chars_cluster = (
        Counter({"101": 1, "31": 1, "30": 1}),
        54.0,
        31,
        (30, 1),
        40.70626487409524,
    )
    self.assertEqual(chars_cluster, expected_chars_cluster)

    # A Counter answers 0 for a key it has never seen.
    histogram = sents_cluster[0]
    self.assertEqual(histogram['0'], 0)
from flask import send_from_directory from flask_cors import CORS from libs import nlp_length_functions from libs.histogram_comparisons import HistogramComparison from libs.review_db import ReviewDB # logging configurations logging.basicConfig(format='%(filename)s:%(lineno)d %(message)s') log = logging.getLogger(__name__) log.setLevel('INFO') # set up data access CONFIG = json.load(open("./../config.json")) data_folder = os.path.join(os.environ['DATA_DIR'], CONFIG['dataset']) schema = json.load(open(os.path.join(data_folder, 'schema.json')))['schema'] database = ReviewDB(data_folder) app = Flask(__name__, static_folder='./react-app/build/static', template_folder='./react-app/build') cors = CORS(app) app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0 app.config['TEMPLATES_AUTO_RELOAD'] = True histogram_comparison_utils = HistogramComparison() # [Xiong] endpoint for sending static files @app.route('/data/<path:subpath>') def data(subpath):
def test_ReviewDB_funcs(self):
    '''
    Test decode_id, fetch_reviews and get_review_from_id of the 'all'
    entity database, using a toy data set.
    '''
    entity_db = ReviewDB('tests/test_data/').entity_db_dict['all']

    def check_authors(frame, expected_authors):
        # Row i of the returned frame must have been written by
        # expected_authors[i].
        for position, author in enumerate(expected_authors):
            self.assertEqual(frame.iloc[[position]].author.values, author)

    # Empty input: all three accessors answer None.
    decoded_nothing = entity_db.decode_id([])
    fetched_nothing = entity_db.fetch_reviews([])
    reviewed_nothing = entity_db.get_review_from_id([])
    for outcome in (decoded_nothing, fetched_nothing, reviewed_nothing):
        self.assertEqual(outcome, None)

    # decode_id(): a plain review id, then progressively shorter cluster
    # ids that all resolve to the same reviews, then another cluster.
    self.assertEqual(entity_db.decode_id(0), [0])
    for cluster_id in ('1-2-1-0-0', '1-2-1-0', '1-2-1', '1-2'):
        self.assertEqual(entity_db.decode_id(cluster_id), [1, 2, 7])
    self.assertEqual(entity_db.decode_id('4'), [0, 5, 6, 8])

    # fetch_reviews()
    check_authors(entity_db.fetch_reviews([0]), ['guest1'])
    check_authors(entity_db.fetch_reviews([0, 3, 7]),
                  ['guest1', 'guest4', 'guest8'])

    # get_review_from_id()
    check_authors(entity_db.get_review_from_id(0), ['guest1'])
    for cluster_id in ('1-2-1-0-0', '1-2-1-0', '1-2-1', '1-2', '1'):
        check_authors(entity_db.get_review_from_id(cluster_id),
                      ['guest2', 'guest3', 'guest8'])
    check_authors(entity_db.get_review_from_id('4'),
                  ['guest1', 'guest6', 'guest7', 'guest9'])

    # The special id "all" selects every review.
    every_review = entity_db.decode_id("all")
    self.assertEqual(every_review, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
log = logging.getLogger(__name__) log.setLevel('INFO') CONFIG = json.load(open("./../config.json")) data_folder = os.path.join(os.environ['DATA_DIR'], CONFIG['dataset']) schema = json.load(open(os.path.join(data_folder, 'schema.json')))['schema'] app = Flask(__name__, static_folder = './react-app/build/static', template_folder = './react-app/build') app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0 app.config['TEMPLATES_AUTO_RELOAD'] = True all_centroids_df = pd.read_csv(os.path.join(data_folder, 'centroids.csv')) log.info('centroids loaded') all_clusters_df = pd.read_csv(os.path.join(data_folder, 'clusters.csv')) log.info('clusters loaded') db_all = ReviewDB(all_clusters_df, all_centroids_df) working_df = None tfidf_model = TfidfModel.TFIDFModel(db_all) tfidf_model_2g = TfidfModel.TFIDFModel(db_all, 2) hotel_attr_path = lambda biz_id: os.path.join(data_folder, f'hotel-clusters/{biz_id}/attr.csv') hotel_centroids_path = lambda biz_id: os.path.join(data_folder, f'hotel-clusters/{biz_id}/centroids.csv') histogram_comparison_utils = HistogramComparison() # [Xiong] setups for CORS access. I do this because I test the frontend on # localhost:3000, while the server runs on localhost:5000. Eventually the CORS # setup will make it possible for data server and front-end hosting server # running on different machines --- which may not be necessary though @app.after_request
def test_TFIDF_init(self):
    """Smoke test: a TFIDFModel can be constructed from the toy data set."""
    entity_db = ReviewDB('tests/test_data/').entity_db_dict['all']
    model = TFIDFModel(entity_db)
    self.assertIsNotNone(model)
def test_nlplength_init(self):
    """Smoke test: an NLPLengths helper can be constructed from the toy data set."""
    entity_db = ReviewDB('tests/test_data/').entity_db_dict['all']
    length_helper = NLPLengths(entity_db)
    self.assertIsNotNone(length_helper)
def test_tfidf_bigram(self):
    """Check that a bigram TFIDFModel scores the two-word term 'wharf rooms'."""
    entity_db = ReviewDB('tests/test_data/').entity_db_dict['all']
    bigram_model = TFIDFModel(entity_db, ngramsize=2)

    # Score review 0 against a single bigram; it must appear in the result.
    bigram = "wharf rooms"
    scores = bigram_model.tfidf_score(0, [bigram])
    self.assertIn(bigram, scores.keys())