示例#1
0
 def test_get_adjacent_words_empty_inputs(self):
     """
     Verifies that an empty token list or an empty word gives an empty result
     """
     empty_cases = (
         ([], 'happy'),
         (['happy'], ''),
     )
     for tokens, word in empty_cases:
         self.assertEqual([], get_adjacent_words(tokens, word, 2, 3))
示例#2
0
 def test_get_adjacent_words_bad_number_inputs(self):
     """
     Verifies the results for negative and zero context sizes
     """
     # A negative left size combined with a zero right size gives nothing.
     for word in ('happy', 'man'):
         found = get_adjacent_words(['happy', 'man'], word, -1, 0)
         self.assertEqual([], found)

     # A zero left size is accepted: only the right neighbour is returned.
     found = get_adjacent_words(['happy', 'man'], 'happy', 0, 1)
     self.assertEqual([['man']], found)
 def test_get_adjacent_words_bad_inputs(self):
     """
     Verifies that invalid values in any argument position give an empty result
     """
     for invalid in [[], {}, 'string', (), None, 9.34, True, [None]]:
         # Run all three calls before asserting anything.
         results = (
             # invalid value used as both context sizes
             get_adjacent_words(['happy', 'man', 'went'], 'man', invalid, invalid),
             # invalid value used as the token list
             get_adjacent_words(invalid, 'happy', 2, 3),
             # invalid value used as the word
             get_adjacent_words(['happy', 'man', 'went'], invalid, 1, 2),
         )
         for result in results:
             self.assertEqual([], result)
示例#4
0
 def test_get_adjacent_word_big_left_number_input(self):
     """
     Verifies the result when the left context size is far larger
     than the number of tokens given
     """
     tokens = ['one', 'happy', 'man']
     found = get_adjacent_words(tokens, 'happy', 1000, 0)
     self.assertEqual([['one']], found)
示例#5
0
 def test_big_text_get_adjacent_words_term(self):
     """
     Verifies the adjacent words found for a term in the text
     read from lab_1/data.txt
     """
     tokens = tokenize(read_from_file('lab_1/data.txt'))
     found = get_adjacent_words(tokens, 'tex', 4, 31)
     self.assertEqual([['although', 'products']], found)
示例#6
0
    def test_get_adjacent_words_several_contexts_big_text(self):
        """
        Verifies that every expected context of a term is found in the text
        read from lab_1/data.txt
        """
        tokens = tokenize(read_from_file('lab_1/data.txt'))
        found = get_adjacent_words(tokens, 'sodium', 1, 1)

        wanted = [
            ['epithelial', 'channels'],
            ['means', 'aluminate'],
            ['by', 'bicarbonate'],
            ['the', 'salt'],
        ]
        self.assertEqual(wanted, found)
    def test_get_adjacent_words_ideal(self):
        """
        Happy-path scenario: the word occurs twice in a valid token list
        """
        sentence = ('the weather is sunny the man is happy '
                    'the dog is happy but the cat is sad')
        tokens = sentence.split(' ')

        found = get_adjacent_words(tokens, 'happy', 2, 3)
        self.assertEqual([['man', 'is'], ['dog', 'cat']], found)
示例#8
0
    # Fragment of a demo script: `tokens`, `data` and `stop_words` are defined
    # before this point (outside the visible part of the file).
    print('\n-----------------------------\n')

    # Count frequencies over all tokens and show the count for the first token.
    frequencies = main.calculate_frequencies(
        tokens)  # old: 116 sec, new: ~81 sec
    print('frequency for the first word:', frequencies[tokens[0]])
    print('\n-----------------------------\n')

    # Ask for the top 10 words from the frequencies.
    top_10 = main.get_top_n_words(frequencies, 10)
    print('top 10 words:', top_10)
    print('\n-----------------------------\n')

    # Concordance for "cat" with sizes 2 (left) and 3 (right); print at most 5.
    concordance_cat = main.get_concordance(tokens, 'cat', 2, 3)
    print('concordance for "cat", left = 2, right = 3:', concordance_cat[:5])
    print('\n-----------------------------\n')

    # Adjacent words for "cat" with the same sizes; print at most 5.
    adjacent_words_cat = main.get_adjacent_words(tokens, 'cat', 2, 3)
    print('adjacent words for "cat" left = 2, right = 3:',
          adjacent_words_cat[:5])
    print('\n-----------------------------\n')

    # Sorted concordance for "cat"; the meaning of the final `True` flag is
    # defined by main.sort_concordance -- TODO confirm against its signature.
    sorted_concordance_cat = main.sort_concordance(tokens, 'cat', 2, 3, True)
    print('sorted concordance for "cat" left = 2, right = 3:',
          sorted_concordance_cat[:5])
    print('\n-----------------------------\n')

    #  main.write_to_file('report.txt', sorted_concordance_cat)

    # Keep the sorted concordance under the name RESULT.
    RESULT = sorted_concordance_cat

    # Tokenize the raw text and strip stop words from it.
    tokenized_data = main.tokenize(data)
    clean_data = main.remove_stop_words(tokenized_data, stop_words)
示例#9
0
if __name__ == '__main__':
    # Demo run: reads data.txt and stop_words.txt located next to this script.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    data = main.read_from_file(os.path.join(current_dir, 'data.txt'))
    # The stop-words file is split on newlines into a list of entries.
    stop_words = main.read_from_file(
        os.path.join(current_dir, 'stop_words.txt')).split('\n')

    # Tokenize the text and drop stop words before any further processing.
    tokenized_data = main.tokenize(data)
    clean_data = main.remove_stop_words(tokenized_data, stop_words)

    # Request the top 13 words and use the last returned one as the key word.
    top_n = main.get_top_n_words(main.calculate_frequencies(clean_data), 13)
    key_word = top_n[-1]
    print(f'13th popular word: {key_word}. Let`s use it for further functions')

    closest_words = main.get_adjacent_words(clean_data, key_word, 3, 2)
    if closest_words:
        print(
            f"\nThird words from the left and second words from the right for "
            f"the word '{key_word}' (first 5 cases) are")
        for adjacent_words in closest_words[:5]:
            print('\t', adjacent_words)

    concordances = main.get_concordance(clean_data, key_word, 2, 2)
    if concordances:
        # The trailing space keeps the two adjacent literals from running
        # together as "...right)for the word".
        print(
            f"\nThe first three concordances (with 2 word on the left and 2 on the right) "
            f"for the word '{key_word}' are")
        for context in concordances[:3]:
            print('\t', context)
if __name__ == '__main__':
    # Demo run: the input files are looked up next to this script.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(current_dir, 'data.txt')
    stop_words_path = os.path.join(current_dir, 'stop_words.txt')

    data = main.read_from_file(data_path)
    # NOTE(review): stop_words is passed on exactly as read_from_file returns
    # it (no splitting here) -- confirm remove_stop_words accepts that form.
    stop_words = main.read_from_file(stop_words_path)

    tokens = main.tokenize(data)
    print(f'Raw text: {data[:5]}')
    print(f'Tokenized text: {tokens[:5]}')

    tokens = main.remove_stop_words(tokens, stop_words)
    print(f'Text without stop-words: {tokens[:5]}')

    # Frequencies are computed on the first 5000 tokens only.
    frequencies = main.calculate_frequencies(tokens[:5000])
    print(f'Frequencies: {frequencies[tokens[0]]}')

    # The same word and context sizes are used for all three calls below.
    word = 'dog'
    left_n, right_n = 2, 0

    concordance = main.get_concordance(tokens, word, left_n, right_n)
    print(f'The concordance for {word}: {concordance[:5]}')

    adjacent = main.get_adjacent_words(tokens, word, left_n, right_n)
    print(f'Adjacent words: {adjacent[:5]}')

    sorted_concordance = main.sort_concordance(
        tokens, word, left_n, right_n, True)
    print(f'Sorted concordance: {sorted_concordance[:5]}')

    main.write_to_file('', sorted_concordance)

    # The run fails if the sorted concordance is empty.
    RESULT = sorted_concordance
    assert RESULT, 'Concordance not working'