def test_frequency_cutoff(self):
    # Passing freq_cut_off=30 is expected to leave exactly two rows.
    filtered = extract_keywords(
        self.analysis,
        self.reference,
        1000,
        10000,
        freq_cut_off=30,
    )
    row_count = len(filtered)
    self.assertEqual(row_count, 2)
def test_limit_rows(self):
    # Passing limit_rows=2 is expected to cap the result at two rows.
    capped = extract_keywords(
        self.analysis,
        self.reference,
        1000,
        10000,
        limit_rows=2,
    )
    row_count = len(capped)
    self.assertEqual(row_count, 2)
def test_p_value(self):
    # An explicit p_value of 0.0001 should yield the four keywords
    # that the fixture data is expected to contain.
    significant = extract_keywords(
        self.analysis,
        self.reference,
        1000,
        10000,
        p_value=0.0001,
    )
    row_count = len(significant)
    self.assertEqual(row_count, 4)
def test_wordlist_merging(self):
    # No p_value is passed here; the original author notes that the
    # default is 0.0001, hence the same four rows as with that cut-off.
    merged = extract_keywords(
        self.analysis,
        self.reference,
        1000,
        10000,
    )
    self.assertEqual(len(merged), 4)

    # 'five' must be present, while the upper-case 'SIX' must not be.
    types = merged.Type.tolist()
    self.assertIn('five', types)
    self.assertNotIn('SIX', types)

    self.assertEqual(merged.Count_analysis[0], 540)
    # 'five' does not occur in the reference list, so its count there is 0
    self.assertEqual(merged.Count_ref[3], 0)
def test_round_values(self):
    # With round_values=False the LL score must not come back with
    # exactly two decimals.
    unrounded = extract_keywords(
        self.analysis,
        self.reference,
        1000,
        10000,
        round_values=False,
    )
    # per the original author's note this value is 1195.463979
    log_likelihood = unrounded.loc[0, 'LL']
    fraction_digits = str(log_likelihood).split('.')[1]
    self.assertNotEqual(len(fraction_digits), 2)
def test_exclude_underused(self):
    '''
    Check that three rows come back when exclude_underused=False.

    Expected result (a reference total of 100 and p_value=0.05 are used):

          Type  Count_analysis  Total_analysis  Count_ref  Total_ref
       0   one             540            1000        540        100
       2  four             431            1000        431        100
       1 three              29            1000         29        100

          Expected_count_analysis  Expected_count_ref       LL  Use  p
       0                   981.82               98.18  1195.46    -  p < 0.0001
       2                   783.64               78.36   954.16    -  p < 0.0001
       1                    52.73                5.27    64.20    -  p < 0.0001
    '''
    with_underused = extract_keywords(
        self.analysis,
        self.reference,
        1000,
        100,
        p_value=0.05,
        exclude_underused=False,
    )
    row_count = len(with_underused)
    self.assertEqual(row_count, 3)
def build_keyword_list(
    cluster_length,
    subset_analysis,
    subcorpora_analysis,
    subset_reference,
    subcorpora_reference,
    p_value,
    limit_rows=3000,
):
    '''
    Build the keyword list for an analysis corpus against a reference corpus.

    This helper exists so that keyword results can be cached: the keywords
    are handed back as records because dataframes cannot be cached.

    Returns a 3-tuple: (keyword records, total_analysis, total_reference),
    where the two totals are the `total` attributes of the word lists built
    for the analysis and the reference side respectively.

    Note that the totals passed on to extract_keywords are the sums of the
    `Count` column of each word list, not those `total` attributes.
    '''
    def load_wordlist(subset, subcorpora):
        # Build one word list and hand back its total together with the
        # word list itself; the total is read before the word list.
        index_name = construct_index_name(subset, cluster_length)
        builder = Cheshire3WordList()
        builder.build_wordlist(index_name, subcorpora)
        return builder.total, builder.wordlist

    # analysis side first, then reference side
    total_analysis, counts_analysis = load_wordlist(
        subset_analysis, subcorpora_analysis)
    total_reference, counts_reference = load_wordlist(
        subset_reference, subcorpora_reference)

    keywords = extract_keywords(
        counts_analysis,
        counts_reference,
        counts_analysis.Count.sum(),
        counts_reference.Count.sum(),
        limit_rows=limit_rows,
        p_value=p_value,
    )
    return keywords.to_records(), total_analysis, total_reference