def test_get_concordance_empty_inputs(self):
    """
    Verifies that an empty token list, an empty word, or zero left and
    right context sizes each produce an empty concordance
    """
    # No tokens to search in.
    self.assertEqual([], get_concordance([], 'happy', 2, 3))
    # No word to search for.
    self.assertEqual([], get_concordance(['happy'], '', 2, 3))
    # Word is present, but both context sizes are zero.
    self.assertEqual([], get_concordance(['happy'], 'happy', 0, 0))
def test_get_concordance_bad_inputs(self):
    """
    Checks that function can handle incorrect inputs

    Every malformed value is tried in three positions: as both context
    sizes, as the token list, and as the target word. An empty list is
    expected in each case. Each value runs inside its own sub-test, so a
    failure report names the offending value and the remaining values
    are still exercised instead of being hidden by the first failure.
    """
    bad_inputs = [[], {}, 'string', (), None, 9.34, True, [None]]
    expected = []
    for bad_input in bad_inputs:
        with self.subTest(bad_input=bad_input):
            # Bad value used as both the left and the right context size.
            actual_1 = get_concordance(['happy', 'man', 'went'], 'man',
                                       bad_input, bad_input)
            # Bad value used in place of the token list.
            actual_2 = get_concordance(bad_input, 'happy', 2, 3)
            # Bad value used in place of the word to look up.
            actual_3 = get_concordance(['happy', 'man', 'went'], bad_input, 1, 2)
            self.assertEqual(expected, actual_1)
            self.assertEqual(expected, actual_2)
            self.assertEqual(expected, actual_3)
def test_get_concordance_bad_number_inputs(self):
    """
    Verifies the handling of negative and zero context sizes: a negative
    left size yields nothing, while a zero left size combined with a
    positive right size still yields a context
    """
    # Negative left size, zero right size.
    self.assertEqual([], get_concordance(['happy', 'man'], 'happy', -1, 0))
    # Zero left size, one token to the right.
    self.assertEqual([['happy', 'man']],
                     get_concordance(['happy', 'man'], 'happy', 0, 1))
    # Same negative/zero pair, this time for the last token.
    self.assertEqual([], get_concordance(['happy', 'man'], 'man', -1, 0))
def test_get_concordance_big_left_number_input(self):
    """
    Verifies that a left context size far larger than the number of
    tokens before the word is accepted and limited to the tokens
    that actually exist
    """
    tokens = ['one', 'happy', 'man']
    oversized_left = 1000
    contexts = get_concordance(tokens, 'happy', oversized_left, 0)
    self.assertEqual([['one', 'happy']], contexts)
def test_get_concordance_several_contexts(self):
    """
    Verifies that one context is returned for every occurrence of the
    word when it appears more than once in the tokens
    """
    tokens = [
        'the', 'weather', 'is', 'sunny',
        'the', 'man', 'is', 'happy',
        'the', 'dog', 'is', 'happy',
        'but', 'the', 'cat', 'is', 'sad',
    ]
    contexts = get_concordance(tokens, 'happy', 2, 3)
    self.assertEqual(
        [
            ['man', 'is', 'happy', 'the', 'dog', 'is'],
            ['dog', 'is', 'happy', 'but', 'the', 'cat'],
        ],
        contexts,
    )
def test_get_concordance_ideal(self):
    """
    Baseline scenario: the word occurs once, with enough tokens on
    both sides to fill the requested left and right context
    """
    tokens = [
        'the', 'weather', 'is', 'sunny',
        'the', 'man', 'is', 'happy',
        'the', 'dog', 'is', 'glad',
        'but', 'the', 'cat', 'is', 'sad',
    ]
    contexts = get_concordance(tokens, 'happy', 2, 3)
    self.assertEqual([['man', 'is', 'happy', 'the', 'dog', 'is']], contexts)
def test_get_concordance_several_contexts_big_text(self):
    """
    Verifies that all contexts of a word are found in the tokenized
    contents of the lab data file
    """
    # The path is relative, so it is resolved against the current working directory.
    tokens = tokenize(read_from_file('lab_1/data.txt'))
    found = get_concordance(tokens, 'sodium', 1, 1)
    wanted = [
        ['epithelial', 'sodium', 'channels'],
        ['means', 'sodium', 'aluminate'],
        ['by', 'sodium', 'bicarbonate'],
        ['the', 'sodium', 'salt'],
    ]
    self.assertEqual(wanted, found)
def test_big_text_get_concordance_term(self):
    """
    Verifies that a single wide context (4 tokens to the left, 31 to the
    right) is extracted correctly from the tokenized lab data file
    """
    # The path is relative, so it is resolved against the current working directory.
    tokens = tokenize(read_from_file('lab_1/data.txt'))
    found = get_concordance(tokens, 'tex', 4, 31)
    wanted = [[
        'although', 'less', 'compact', 'than',
        'tex',
        'the', 'xml', 'structuring', 'promises', 'to', 'make', 'it',
        'widely', 'usable', 'and', 'allows', 'for', 'instant', 'display',
        'in', 'applications', 'such', 'as', 'web', 'browsers', 'and',
        'facilitates', 'an', 'interpretation', 'of', 'its', 'meaning',
        'in', 'mathematical', 'software', 'products',
    ]]
    self.assertEqual(wanted, found)
# Visual divider printed after every stage of the demo.
divider = '\n-----------------------------\n'

# Author's timing note for this step: old: 34 sec, new: 3.4 sec.
tokens = main.remove_stop_words(tokens, stop_words)
print('tokens without stop words:', tokens[:10])
print(divider)

# Author's timing note for this step: old: 116 sec, new: ~81 sec.
frequencies = main.calculate_frequencies(tokens)
print('frequency for the first word:', frequencies[tokens[0]])
print(divider)

top_10 = main.get_top_n_words(frequencies, 10)
print('top 10 words:', top_10)
print(divider)

# The three context-based calls below share the same word and context sizes.
term, left_size, right_size = 'cat', 2, 3

concordance_cat = main.get_concordance(tokens, term, left_size, right_size)
print('concordance for "cat", left = 2, right = 3:', concordance_cat[:5])
print(divider)

adjacent_words_cat = main.get_adjacent_words(tokens, term, left_size, right_size)
print('adjacent words for "cat" left = 2, right = 3:', adjacent_words_cat[:5])
print(divider)

sorted_concordance_cat = main.sort_concordance(tokens, term, left_size, right_size, True)
print('sorted concordance for "cat" left = 2, right = 3:', sorted_concordance_cat[:5])
print(divider)

# Writing the report is left disabled, as in the original:
# main.write_to_file('report.txt', sorted_concordance_cat)
tokenized_data = main.tokenize(data) clean_data = main.remove_stop_words(tokenized_data, stop_words) top_n = main.get_top_n_words(main.calculate_frequencies(clean_data), 13) key_word = top_n[-1] print(f'13th popular word: {key_word}. Let`s use if for further functions') closest_words = main.get_adjacent_words(clean_data, key_word, 3, 2) if len(closest_words) > 0: print( f"\nThird words from the left and second words from the right for " f"the word '{key_word}' (first 5 cases) are") for adjacent_words in closest_words[:5]: print('\t', adjacent_words) concordances = main.get_concordance(clean_data, key_word, 2, 2) if len(concordances) > 0: print( f"\nThe first three concordances (with 2 word on the left and 2 on the right)" f"for the word '{key_word}' are") for context in concordances[:3]: print('\t', context) sorted_concordance_left = main.sort_concordance(clean_data, key_word, 2, 2, True) if len(sorted_concordance_left) > 0: print('\nConcordance sorted by the first left word (first 5 cases):') for concordance in sorted_concordance_left[:5]: print('\t', concordance) sorted_concordance_right = main.sort_concordance(clean_data, key_word, 2,
if __name__ == '__main__':
    # Demo run of the concordance pipeline: read -> tokenize -> remove
    # stop words -> frequencies -> concordance / adjacent words / sorted
    # concordance -> write out.

    # Locate the input files next to this script, not in the current working directory.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    data = main.read_from_file(os.path.join(current_dir, 'data.txt'))
    stop_words = main.read_from_file(
        os.path.join(current_dir, 'stop_words.txt'))

    tokens = main.tokenize(data)
    print(f'Raw text: {data[:5]}')
    print(f'Tokenized text: {tokens[:5]}')

    tokens = main.remove_stop_words(tokens, stop_words)
    print(f'Text without stop-words: {tokens[:5]}')

    # Frequencies are computed over the first 5000 tokens only
    # (presumably to keep the demo fast -- confirm with the author).
    frequencies = main.calculate_frequencies(tokens[:5000])
    print(f'Frequencies: {frequencies[tokens[0]]}')

    # One word and one pair of context sizes drive every call below, so
    # changing them here keeps the three outputs consistent with each other.
    word = 'dog'
    left_size, right_size = 2, 0

    concordance = main.get_concordance(tokens, word, left_size, right_size)
    print(f'The concordance for {word}: {concordance[:5]}')

    adjacent = main.get_adjacent_words(tokens, word, left_size, right_size)
    print(f'Adjacent words: {adjacent[:5]}')

    sorted_concordance = main.sort_concordance(
        tokens, word, left_size, right_size, True)
    print(f'Sorted concordance: {sorted_concordance[:5]}')

    # NOTE(review): an empty string is passed as the first argument;
    # presumably it is a target path/directory -- confirm against
    # main.write_to_file.
    main.write_to_file('', sorted_concordance)

    # RESULT and the assertion are kept last: the run fails loudly when
    # the sorted concordance is empty.
    RESULT = sorted_concordance
    assert RESULT, 'Concordance not working'