def test_nlplength_funcs(self):
    """Compare the NLPLengths length counters with hard-coded fixture values.

    The word-token, sentence-token and character counters are each called
    with ``0`` and with the id ``'1-2-1-0-0'``; every result is printed and
    then compared with a literal 5-tuple whose first element is a Counter.
    NOTE(review): the remaining elements look like mean, median, mode and
    standard deviation -- confirm against NLPLengths.
    """
    lengths = NLPLengths(ReviewDB.load(cluster_file='tests/testing_db.csv'))
    cluster_id = '1-2-1-0-0'

    def show_and_compare(observed, expected):
        # Echo the observed tuple first so it is visible even when the
        # comparison below fails.
        print(observed)
        self.assertEqual(observed, expected)

    # An empty list yields an empty Counter and all-zero statistics.
    self.assertEqual(
        lengths.word_token_review_length_counter([]),
        (Counter(), 0, 0, 0, 0),
    )

    # Word-token lengths.
    show_and_compare(
        lengths.word_token_review_length_counter(0),
        (Counter({"12": 1}), 12.0, 12, (12, 1), 0.0),
    )
    show_and_compare(
        lengths.word_token_review_length_counter(cluster_id),
        (
            Counter({"22": 1, "7": 1, "6": 1}),
            11.666666666666666,
            7,
            (6, 1),
            8.962886439832502,
        ),
    )

    # Sentence-token lengths.
    show_and_compare(
        lengths.sent_token_review_length_counter(0),
        (Counter({"1": 1}), 1.0, 1, (1, 1), 0.0),
    )
    sentence_cluster = lengths.sent_token_review_length_counter(cluster_id)
    show_and_compare(
        sentence_cluster,
        (
            Counter({"1": 2, '3': 1}),
            1.6666666666666667,
            1,
            (1, 2),
            1.1547005383792515,
        ),
    )

    # Character lengths.
    show_and_compare(
        lengths.char_review_length_counter(0),
        (Counter({"53": 1}), 53.0, 53, (53, 1), 0.0),
    )
    show_and_compare(
        lengths.char_review_length_counter(cluster_id),
        (
            Counter({"101": 1, "31": 1, "30": 1}),
            54.0,
            31,
            (30, 1),
            40.70626487409524,
        ),
    )

    # Looking up a key that is absent from the histogram gives 0.
    self.assertEqual(sentence_cluster[0]['0'], 0)
def test_density_estimator(self):
    """Check that the density estimate of a length histogram sums to one.

    The histogram is the first element of the character-length result for
    the id ``'1-2-1-0-0'``, loaded from ``tests/test_data/``.

    NOTE(review): this file section defines ``test_density_estimator``
    twice; if both definitions live in the same class, the later one
    replaces this one and this test never runs -- confirm and rename.
    """
    db = ReviewDB('tests/test_data/')
    nlp = NLPLengths(db.entity_db_dict['all'])
    histogram_comparison = HistogramComparison()

    char_result_cluster = nlp.char_review_length_counter('1-2-1-0-0')
    histogram = char_result_cluster[0]
    density_estimate = histogram_comparison.density_estimator(histogram)

    # The total is accumulated from floats, so compare with a tolerance
    # instead of demanding a bit-exact 1.0.
    self.assertAlmostEqual(sum(density_estimate.values()), 1.0)
def test_density_estimator(self):
    """Check that the density estimate of a length histogram sums to one.

    The histogram is the first element of the character-length result for
    the id ``'1-2-1-0-0'``, loaded from ``tests/testing_db.csv``.

    NOTE(review): this file section defines ``test_density_estimator``
    twice; if both definitions live in the same class, this one replaces
    the earlier one, which then never runs -- confirm and rename.
    """
    db = ReviewDB.load(cluster_file='tests/testing_db.csv')
    nlp = NLPLengths(db)
    histogram_comparison = HistogramComparison()

    char_result_cluster = nlp.char_review_length_counter('1-2-1-0-0')
    histogram = char_result_cluster[0]
    density_estimate = histogram_comparison.density_estimator(histogram)

    # The total is accumulated from floats, so compare with a tolerance
    # instead of demanding a bit-exact 1.0.
    self.assertAlmostEqual(sum(density_estimate.values()), 1.0)
def test_sorensen(self):
    """Exercise HistogramComparison.sorensen on three histogram pairs.

    Cases: a fixture histogram against itself (expects 0.0),
    ``{"1": 1}`` against ``{"1": 0}`` (expects 1.0), and two partially
    overlapping histograms (expects roughly 0.66667).
    """
    db = ReviewDB('tests/test_data/')
    nlp = NLPLengths(db.entity_db_dict['all'])
    histogram_comparison = HistogramComparison()

    # Comparing a histogram with itself.
    histogram1 = nlp.char_review_length_counter('1-2-1-0-0')[0]
    compare_self = histogram_comparison.sorensen(histogram1, histogram1)
    self.assertEqual(compare_self, 0.0)

    # Same single key, one side with a zero count.
    trivial_histogram1 = Counter({"1": 1})
    trivial_histogram2 = Counter({"1": 0})
    compare_trivial = histogram_comparison.sorensen(
        trivial_histogram1, trivial_histogram2)
    self.assertEqual(compare_trivial, 1.0)

    # Histograms sharing only the key "2".
    more_complicated_histogram1 = Counter({"1": 1, "2": 2})
    more_complicated_histogram2 = Counter({"2": 3, "3": 4})
    compare_more_complicated = histogram_comparison.sorensen(
        more_complicated_histogram1, more_complicated_histogram2)
    # Two-sided tolerance: a one-sided ``actual - expected < tol`` check
    # would accept any result below the expected value.
    self.assertAlmostEqual(compare_more_complicated, 0.66667, delta=.001)
def test_hellinger(self):
    """Exercise HistogramComparison.hellinger on three histogram pairs.

    Cases: a fixture histogram against itself (expects 0.0),
    ``{"1": 1}`` against ``{"1": 0}`` (expects about 0.7071), and two
    partially overlapping histograms (expects roughly 0.68226).
    """
    db = ReviewDB('tests/test_data/')
    nlp = NLPLengths(db.entity_db_dict['all'])
    histogram_comparison = HistogramComparison()

    # Comparing a histogram with itself.
    histogram1 = nlp.char_review_length_counter('1-2-1-0-0')[0]
    compare_self = histogram_comparison.hellinger(histogram1, histogram1)
    self.assertEqual(compare_self, 0.0)

    # Same single key, one side with a zero count.
    trivial_histogram1 = Counter({"1": 1})
    trivial_histogram2 = Counter({"1": 0})
    compare_trivial = histogram_comparison.hellinger(
        trivial_histogram1, trivial_histogram2)
    # The expected value is irrational, so allow for rounding rather than
    # requiring a bit-exact float.
    self.assertAlmostEqual(compare_trivial, 0.7071067811865475)

    # Histograms sharing only the key "2".
    more_complicated_histogram1 = Counter({"1": 1, "2": 2})
    more_complicated_histogram2 = Counter({"2": 3, "3": 4})
    compare_more_complicated = histogram_comparison.hellinger(
        more_complicated_histogram1, more_complicated_histogram2)
    # Two-sided tolerance: a one-sided ``actual - expected < tol`` check
    # would accept any result below the expected value.
    self.assertAlmostEqual(
        compare_more_complicated, 0.6822591268536838, delta=.001)
def test_euclidean(self):
    """Exercise HistogramComparison.euclidean on three histogram pairs.

    Cases: a fixture histogram against itself (expects 0.0),
    ``{"1": 1}`` against ``{"1": 0}`` (expects 1.0), and two partially
    overlapping histograms (expects roughly 4.24264).
    """
    db = ReviewDB.load(cluster_file='tests/testing_db.csv')
    nlp = NLPLengths(db)
    histogram_comparison = HistogramComparison()

    # Comparing a histogram with itself.
    histogram1 = nlp.char_review_length_counter('1-2-1-0-0')[0]
    compare_self = histogram_comparison.euclidean(histogram1, histogram1)
    self.assertEqual(compare_self, 0.0)

    # Same single key, one side with a zero count.
    trivial_histogram1 = Counter({"1": 1})
    trivial_histogram2 = Counter({"1": 0})
    compare_trivial = histogram_comparison.euclidean(
        trivial_histogram1, trivial_histogram2)
    self.assertEqual(compare_trivial, 1.0)

    # Histograms sharing only the key "2".
    more_complicated_histogram1 = Counter({"1": 1, "2": 2})
    more_complicated_histogram2 = Counter({"2": 3, "3": 4})
    compare_more_complicated = histogram_comparison.euclidean(
        more_complicated_histogram1, more_complicated_histogram2)
    # Two-sided tolerance: a one-sided ``actual - expected < tol`` check
    # would accept any result below the expected value.
    self.assertAlmostEqual(compare_more_complicated, 4.24264, delta=.001)