class TestWordCounter(unittest.TestCase):

    def setUp(self):
        conf = SparkConf().setAppName("appTest").setMaster("local[*]")
        self.sc = SparkContext(conf=conf)
        self.counter = WordCounter()

    def tearDown(self):
        self.sc.stop()

    def test_when_exist_one_movie_and_counter(self):
        movieList = [
            "1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
            "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"
        ]
        result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']),
                  ('Toy', ['::Toy Story Toy (1995)::']))
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)

    def test_when_exist_one_movie_and_counter_moreMovies(self):
        movieList = [
            "1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
            "1993::ToyA StoryB ToyA (1995)::Animation|Children's|Comedy",
            "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"
        ]
        result = ('ToyA', ['::ToyA StoryB ToyA (1995)::',
                           '::ToyA StoryA ToyA (1995)::'])
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)
def testWordTally(self):
    wc = WordCounter()
    basicString = "I am bad bad bad at python, its amazing how quickly you can forget python"
    wc.processString(basicString)
    wordList = wc.getWordList()
    self.assertEqual(3, wordList['bad'])
    self.assertEqual(2, wordList['python'])
def testCountWords(self):
    """Test the countWords method of the WordCounter class."""
    # Number of words in text4.txt
    num = 200
    tester = WordCounter("tests/text4.txt")
    self.assertEqual(tester.countWords(), num)
def __init__(self, table_name_key):
    DataProcessInterface.__init__(self, table_name_key)
    # self.output_filename = parameters.get('output_filename')
    # self.output_folder = parameters.get('output_folder')
    util = Util()
    self.input_folder = util.getInputFolder(table_name_key)
    self.doc_no = 1  # assuming all docs are read from the beginning to the end
    self.para_no = 1
    self.sentance_no = 1
    self.word_no = 1
    self.wordCounter = WordCounter()
    # self.bufferedWriter = BufferedWriter(self.output_folder,
    #                                      self.output_filename)
    self.bufferedWriter = BufferedWriter(
        util.getOutputFolder(table_name_key),
        '{}.json'.format(table_name_key))
    self.dict_ = {
        'documents': {},
        'paragraphs': {},
        'words': {},
        'word-documents': {}
    }
    self.current_doc_name = None
class WordCounterFixtureTests(TestCase):

    def setUp(self):
        """Prepare before each test"""
        MOBYDICK_SUMMARY = open('mobydick_summary.txt').read()
        self.text = TextBody(MOBYDICK_SUMMARY)
        self.counter = WordCounter(self.text)

    def test_count_months(self):
        self.assertEqual(self.counter.count_word("months"), 1)

    def test_count_the(self):
        """Count word in a longer text"""
        self.assertEqual(self.counter.count_word("the"), 6)

    def tearDown(self):
        """Clean up after a test has passed or failed."""
        pass
def testGetTopOccurrences(self):
    """Test the getTopOccurrences method of the WordCounter class
    by checking a test .txt file against a pre-checked dict."""
    counter = WordCounter("tests/testFile.txt")
    trueValue = {
        "quick": 7,
        "jumps": 7,
        "brown": 5,
        "zebra": 3,
        "table": 3,
        "over": 4,
        "dog": 4,
        "layer": 5,
        "crazy": 3,
        "productive": 4
    }
    self.assertDictEqual(counter.getTopOccurrences(), trueValue)
def testApostrophesHandledOk(self):
    wc = WordCounter()
    basicString = "I'm a Quick brown fox jumped over 9 the lazy dog"
    wc.processString(basicString)
    self.assertEqual(11, wc.getWordCount())
    wl = wc.getWordList()
    self.assertIn("I'm", wl)
class WordCounterTest(unittest.TestCase):

    def set_up(self, filename):
        self.f = open(filename)
        self.fr = self.f.read()
        self.wc = WordCounter(self.fr)

    def test_empty(self):
        self.set_up("samples/empty.txt")
        self.assertEqual(self.wc.get_count(), 0)
        self.assertEqual(self.wc.get_most_frequent(), {})

    def test_base(self):
        self.set_up("samples/test.txt")
        self.assertEqual(self.wc.get_count(), 9)
        self.assertEqual(len(self.wc.get_most_frequent().keys()), 8)

    def test_long(self):
        self.set_up("samples/long.txt")
        self.assertEqual(self.wc.get_count(), 188)
        self.assertEqual(len(self.wc.get_most_frequent().keys()), 10)

    def tearDown(self):
        self.wc = None
        self.f.close()
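The sample files themselves are not shown. Reading the assertions, get_count() appears to return the total word count and get_most_frequent() at most the ten most frequent words. A samples/test.txt consistent with test_base (an illustrative guess, not the repository's actual file) is any text with nine words, eight of them distinct:

# hypothetical samples/test.txt contents: 9 words, 8 distinct
# ("the" repeats), satisfying both assertions in test_base
SAMPLE = "the quick brown fox jumps over the lazy dog"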
def test_when_input_is_one_word_return_count_for_one_word():
    assert WordCounter.count("hello") == "hello, 1"
#!/usr/bin/env python
#
# example of a test generator
#
from nose.tools import assert_equal

from word_counter import TextBody, WordCounter

MOBYDICK_SUMMARY = open('mobydick_summary.txt').read()
TEXT = TextBody(MOBYDICK_SUMMARY)
COUNTER = WordCounter(TEXT)

WORD_PAIRS = [('months', 1), ('whale', 5), ('captain', 4),
              ('white', 2), ('harpoon', 1), ('Ahab', 1)]


def check_word(word, number):
    assert_equal(COUNTER.count_word(word), number)


def test_word_pairs():
    # Tests a series of example words:
    # creates one test for each word
    # --- no docstring so that parameters are visible ---
    for word, number in WORD_PAIRS:
        yield check_word, word, number

# nose consumes the generator roughly as:
#   for x, y, z in test_word_pairs():
#       x(y, z)
def test_count_word_simple():
    """Count word in a short text"""
    text = TextBody("the white white whale")
    counter = WordCounter(text)
    assert_equal(counter.count_word("white"), 2)
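As a rough sketch (an assumption, not the actual code in word_counter.py), a TextBody/WordCounter pair that satisfies this test could be as small as:

class TextBody:
    def __init__(self, text):
        self.words = text.split()  # naive whitespace tokenization

class WordCounter:
    def __init__(self, textbody):
        self.textbody = textbody

    def count_word(self, word):
        # occurrences of `word` in the tokenized text
        return self.textbody.words.count(word)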
def testBasicSentence(self):
    wc = WordCounter()
    basicString = "Quick brown fox jumped over the lazy dog."
    wc.processString(basicString)
    self.assertEqual(8, wc.getWordCount())
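Taken together with testWordTally and testApostrophesHandledOk above, these assertions pin down the tokenization: split on whitespace, strip surrounding punctuation, keep apostrophes. A minimal sketch that passes all three (an assumption, not the implementation under test):

class WordCounter:
    def __init__(self):
        self._words = {}

    def processString(self, s):
        for token in s.split():
            # drop surrounding punctuation but keep apostrophes,
            # so "dog." becomes "dog" while "I'm" stays intact
            word = token.strip(".,!?;:\"()")
            if word:
                self._words[word] = self._words.get(word, 0) + 1

    def getWordCount(self):
        return sum(self._words.values())

    def getWordList(self):
        return self._words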
def run():
    word_counter = WordCounter('kennedy.txt')
    word_counter.display()
from word_counter import WordCounter
from ui import ui

if __name__ == "__main__":
    _ui = ui()
    _ui.handle_input()
    counter = WordCounter(_ui.file_name)
    success = counter.run()
    if success:
        _ui.display_word_count(counter)
        _ui.display_exit_message()
    else:
        _ui.display_error_message()
    exit()
def test_get_most_common_words(self):
    obj = WordCounter()
    obj.text_content = self.content
    self.assertEqual(obj.get_most_common_words(2),
                     [('Four', 4), ('Three', 3)])
import re

from word_counter import WordCounter

# pattern for splitting words: any run of digits, non-word
# characters, whitespace, or underscores
PATTERN = r"[\d\W\s_]+"

wc = WordCounter()
with open('README.md', 'r') as file:
    for line in file:
        for word in re.split(PATTERN, line.strip()):
            if word:
                wc.add_word(word)

# display the word counts
for word in sorted(wc):
    print(f"{wc[word]:3} {word}")
def test_get_total_words(self):
    obj = WordCounter()
    obj.text_content = self.content
    self.assertEqual(obj.get_total_words(), 10)
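The shared self.content fixture is not shown; any text with ten words in which "Four" appears four times and "Three" three times satisfies both this test and test_get_most_common_words above. A hypothetical fixture:

# hypothetical setUp fixture consistent with both tests
self.content = "Four Four Four Four Three Three Three Two Two One"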
def test_initialize_word_count_has_default_content(self):
    obj = WordCounter()
    self.assertIn('Lady Gaga', obj.text_content)
print("\nPuede escoger una de las siguientes opciones:") print("(1) Relación de las palabras utilizadas en el texto, así como las veces que aparecen.") print(f"(2) Las {words_most_used} palabras más utilizadas en el texto, asi como las veces que aparecen.") try: option = int(input("Escribe el número de una de la opción seleccionada: ")) except ValueError: print(f"\nLo sentimos, la opción seleccionada no es válida.") another_try = input("¿Gusta volver a seleccionar una opción? S=si / N=no: ") another_try = another_try.lower() if another_try == "s": continue else: print("\nMuchas gracias por haber usado esta aplicación, esperamos que vuelva pronto.") exit() if option == 1: word_counter = WordCounter(words_in_text) counted_words = word_counter.count_words() print(f"\nLa relación de palabras que aprecen {text.file_name}, con la cantidad de veces que aparece cada una de ellas, es la siguiente:") for key, value in counted_words.items(): if value == 1: print(f"{key} aparece {value} vez.") else: print(f"{key} aparece {value} veces.") is_done = True elif option == 2: word_counter_sorter = CountedWordsSorter(words_in_text, words_to_ignore) counted_words = word_counter_sorter.count_words() sorted_counted_words = word_counter_sorter.sort_counted_words(counted_words) counter = 0 if words_most_used == 1: print(f"\nLa palabra más utilizada en {text.file_name} es la siguiente:")
def test_when_input_is_two_words_return_count_for_each_word():
    assert WordCounter.count("hello world") == "hello, 1\nworld, 1"
def test_count_word_simple(self):
    """Count a single word"""
    counter = WordCounter(MockText)
    self.assertEqual(counter.count_word("white"), 2)
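The MockText stub is not shown. Under the TextBody sketch given earlier (itself an assumption), a class attribute would be enough, which would also explain why the class itself rather than an instance is passed:

class MockText:
    # same word list as TextBody("the white white whale")
    words = "the white white whale".split()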
if __name__ == '__main__':
    started = datetime.now()

    keywords = ["Familie"]  # build a list of keywords
    analyzer = Analyzer(keywords).start()  # create an Analyzer instance and start it

    ended = datetime.now()
    elapsed = ended - started
    print("Analyzer time: {}".format(elapsed))

    # if the user wants it, compute the word count
    if input("Compute WordCount? (y/n) - ").lower() == "y":
        ended = None
        # create a WordCounter instance and start it
        counter = WordCounter()
        counter.start()
        # create a WeeklyCounter instance
        weekly = WeeklyCounter(counter)
        weekly.start()

    if not ended:
        ended = datetime.now()
        elapsed = ended - started

    print("Time started: {}".format(started))
    print("Time ended: {}".format(ended))
    print("Total time running: {}".format(elapsed))
def test_when_input_has_a_word_that_repeats_return_correct_count_for_word_that_repeats():
    assert WordCounter.count('hello world hello') == "hello, 2\nworld, 1"
def main():
    logging.info("Starting...")

    training_parser = InputParser('/Users/skobovm/repos/csep517/hmm/data/twt.train.json')
    dev_parser = InputParser('/Users/skobovm/repos/csep517/hmm/data/twt.dev.json')
    test_parser = InputParser('/Users/skobovm/repos/csep517/hmm/data/twt.test.json')

    # First, count the words!
    counter = WordCounter()
    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            for i in range(1, len(parsed_sentence) - 1):
                counter.add_word(parsed_sentence[i][0])

    # Finalize counter and separate high frequency from low frequency
    counter.finalize()

    # Initialize the models
    bigram = BigramHMM()
    trigram = TrigramHMM()
    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            # Convert the low frequency words to classes
            counter.classify_sentence(parsed_sentence)
            bigram.add_sentence(parsed_sentence)
            trigram.add_sentence(parsed_sentence)

    # Models have been initialized at this point, finalize the distributions
    #bigram.finalize()
    trigram.finalize()

    # PICK THE PARSER HERE
    parser = dev_parser

    # Iterate over data and try to predict
    num_correct_bigram = 0
    num_correct_trigram = 0
    total_words = 0
    for parsed_sentence in parser.get_tokenized_sentences():
        if parsed_sentence:
            original_sentence = copy.deepcopy(parsed_sentence)

            # Convert the low frequency words to classes
            counter.classify_sentence(parsed_sentence)

            # Bigram lattice
            #lattice = Lattice(bigram, parsed_sentence)

            # Trigram lattice
            tri_lattice = TrigramLattice(trigram, parsed_sentence)

            # Calculate best POS using viterbi
            #pos_list_bigram = lattice.get_pos()
            pos_list_trigram = tri_lattice.get_pos()

            # Determine how many were correct
            #num_correct_bigram += get_num_correct(parsed_sentence, pos_list_bigram, lattice)
            num_correct_trigram += get_num_correct(parsed_sentence, pos_list_trigram,
                                                   tri_lattice, original_sentence, counter)

            # Remove the START and STOP chars
            total_words += (len(parsed_sentence) - 2)
            print("Accuracy: %s" % (num_correct_trigram / total_words))
        else:
            print("ERROR! Couldn't parse sentence")

    print("Bigram HMM Accuracy: %s/%s - %s" %
          (num_correct_bigram, total_words, (num_correct_bigram / total_words)))
    print("Trigram HMM Accuracy: %s/%s - %s" %
          (num_correct_trigram, total_words, (num_correct_trigram / total_words)))
def test_when_input_has_comma_as_delimiter_return_count_for_words():
    assert WordCounter.count('hello,kitty') == "hello, 1\nkitty, 1"
def test_count_word_complex():
    """Count word in a longer text"""
    text = TextBody(MOBYDICK_SUMMARY)
    counter = WordCounter(text)
    assert_equal(counter.count_word("white"), 2)
def test_when_input_has_more_than_one_delimiter_return_count_for_words():
    assert WordCounter.count('merry christmas,kitty') == "merry, 1\nchristmas, 1\nkitty, 1"
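Together, these five tests pin down WordCounter.count: split on commas and whitespace, count words in order of first appearance, and render one "word, n" pair per line. A minimal sketch that passes all five (an assumption, not the implementation under test):

import re

class WordCounter:
    @staticmethod
    def count(text):
        counts = {}  # dicts preserve insertion order in Python 3.7+
        for word in re.split(r"[,\s]+", text.strip()):
            if word:
                counts[word] = counts.get(word, 0) + 1
        return "\n".join(f"{word}, {n}" for word, n in counts.items())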
def read(self, filePlace, type):
    movies = self._sc.textFile(filePlace)
    counter = WordCounter()
    # compare strings with ==, not identity (`is`)
    if type == 'Years':
        counter = YearsCounter()
    return counter.getMaxValues(movies)