Exemplo n.º 1
0
    def __init__(self, table_name_key):
        """Initialize processing state for the table identified by *table_name_key*."""
        DataProcessInterface.__init__(self, table_name_key)

        helper = Util()
        self.input_folder = helper.getInputFolder(table_name_key)

        # Position counters; documents are assumed to be read front to back.
        self.doc_no = 1
        self.para_no = 1
        self.sentance_no = 1
        self.word_no = 1

        self.wordCounter = WordCounter()

        # Results are written to <output folder>/<table_name_key>.json
        self.bufferedWriter = BufferedWriter(
            helper.getOutputFolder(table_name_key),
            '{}.json'.format(table_name_key))

        # Accumulated results, bucketed by category.
        self.dict_ = {
            'documents': {},
            'paragraphs': {},
            'words': {},
            'word-documents': {},
        }

        self.current_doc_name = None
Exemplo n.º 2
0
    def testAppostrophesHandledOk(self):
        """An apostrophe must not split a contraction into two words."""
        counter = WordCounter()
        counter.processString("I'm a Quick brown fox jumped over 9 the lazy dog")
        self.assertEqual(11, counter.getWordCount())
        # The contraction survives as a single token in the word list.
        self.assertTrue("I'm" in counter.getWordList())
Exemplo n.º 3
0
 def testCountWords(self):
     """Check that countWords() reports the known total for text4.txt."""
     expected = 200  # pre-counted number of words in tests/text4.txt
     counter = WordCounter("tests/text4.txt")
     self.assertEqual(counter.countWords(), expected)
Exemplo n.º 4
0
    def testWordTally(self):
        """Repeated words are tallied with their occurrence counts."""
        counter = WordCounter()
        counter.processString(
            "I am bad bad bad at python, its amazing how quickly you can forget python")
        tally = counter.getWordList()
        self.assertEqual(3, tally['bad'])
        self.assertEqual(2, tally['python'])
Exemplo n.º 5
0
    def testGetTopOccurences(self):
        """
        Test the getTopOccurrences method of the WordCounter class:
        run it over a test .txt file and compare the result against a
        pre-checked word -> count dict.
        """
        # Renamed from `object`, which shadowed the builtin of the same name.
        counter = WordCounter("tests/testFile.txt")
        expected = {
            "quick": 7,
            "jumps": 7,
            "brown": 5,
            "zebra": 3,
            "table": 3,
            "over": 4,
            "dog": 4,
            "layer": 5,
            "crazy": 3,
            "productive": 4
        }
        self.assertDictEqual(counter.getTopOccurrences(), expected)
Exemplo n.º 6
0
 def test_get_total_words(self):
     """The word total is computed from the injected text content."""
     counter = WordCounter()
     counter.text_content = self.content
     self.assertEqual(counter.get_total_words(), 10)
Exemplo n.º 7
0
    def test_initialize_word_count_has_default_content(self):
        """A fresh WordCounter carries default text mentioning Lady Gaga."""
        counter = WordCounter()
        self.assertIn('Lady Gaga', counter.text_content)
Exemplo n.º 8
0
 def testNumbersCounted(self):
     """A standalone number counts as one word."""
     counter = WordCounter()
     counter.processString("Quick brown fox jumped over 9 the lazy dog")
     self.assertEqual(9, counter.getWordCount())
Exemplo n.º 9
0
def test_count_word_simple():
    """Count occurrences of one word in a short text."""
    body = TextBody("the white white whale")
    assert_equal(WordCounter(body).count_word("white"), 2)
Exemplo n.º 10
0
 def setUp(self):
     """Prepare before each test: load the summary text into a WordCounter.

     Uses a context manager so the file handle is closed promptly; the
     original left the handle to the garbage collector.
     """
     with open('mobydick_summary.txt') as summary_file:
         mobydick_summary = summary_file.read()
     self.text = TextBody(mobydick_summary)
     self.counter = WordCounter(self.text)
Exemplo n.º 11
0
def run():
    """Build a WordCounter over kennedy.txt and print its report."""
    WordCounter('kennedy.txt').display()
Exemplo n.º 12
0
 def set_up(self, filename):
     """Open *filename*, read its full contents, and build a WordCounter.

     NOTE(review): the open handle is kept on self.f -- presumably closed
     in a tear-down hook elsewhere; confirm, otherwise it leaks.
     """
     self.f = open(filename)
     self.fr = self.f.read()
     self.wc = WordCounter(self.fr)
Exemplo n.º 13
0
 def testBasicSentence(self):
     """A plain sentence: the trailing period adds no extra word."""
     counter = WordCounter()
     counter.processString("Quick brown fox jumped over the lazy dog.")
     self.assertEqual(8, counter.getWordCount())
Exemplo n.º 14
0
 def testHandlesTemperature(self):
     """A signed value such as -9C is counted as a single word."""
     counter = WordCounter()
     counter.processString("The temperature outside, is -9C")
     self.assertEqual(5, counter.getWordCount())
Exemplo n.º 15
0
 def testHandlesCommasOk(self):
     """Commas act as separators and do not inflate the word count."""
     counter = WordCounter()
     counter.processString("I'm a Quick brown, fox jumped over 9 the lazy dog")
     self.assertEqual(11, counter.getWordCount())
Exemplo n.º 16
0
 def test_get_most_common_words(self):
     """Top-N query returns (word, count) pairs, most frequent first."""
     counter = WordCounter()
     counter.text_content = self.content
     expected = [('Four', 4), ('Three', 3)]
     self.assertEqual(counter.get_most_common_words(2), expected)
Exemplo n.º 17
0
import re
from word_counter import WordCounter

# Pattern for splitting words: runs of digits, non-word chars, whitespace, "_".
# Raw string required -- "\d"/"\W" in a plain literal are invalid escape
# sequences and raise warnings on modern Python.
PATTERN = r"[\d\W\s_]+"

# Compile once up front; the split runs for every line of the file.
_SPLITTER = re.compile(PATTERN)

wc = WordCounter()

with open('README.md', 'r') as file:
    for line in file:
        for word in _SPLITTER.split(line.strip()):
            if word:
                wc.add_word(word)

# display the word counts
for word in sorted(wc):
    print(f"{wc[word]:3}  {word}")
Exemplo n.º 18
0

if __name__ == '__main__':
	started = datetime.now()

	keywords = ["Familie"]  # build the list of keywords to analyze
	analyzer = Analyzer(keywords).start()  # create an Analyzer instance and start it

	ended = datetime.now()
	elapsed = ended - started
	print("Analyzer time: {}".format(elapsed))

	if input("WordCount errechnen? (y/n) - ").lower() == "y":
		# if the user wants it, compute the word counts
		# reset so the final end time includes the counting phase
		ended = None

		# create a WordCounter instance and start it
		counter = WordCounter()
		counter.start()

		# create a WeeklyCounter instance and start it
		weekly = WeeklyCounter(counter)
		weekly.start()

	if not ended: ended = datetime.now()

	elapsed = ended - started
	print("Time started: {}".format(started))
	print("Time ended: {}".format(ended))
	print("Total time running: {}".format(elapsed))
Exemplo n.º 19
0
 def setUp(self):
     """Spin up a local SparkContext and a fresh WordCounter before each test."""
     conf = SparkConf().setAppName("appTest").setMaster("local[*]")
     self.sc = SparkContext(conf=conf)
     self.counter = WordCounter()
Exemplo n.º 20
0
 def test_count_word_simple(self):
     """Count a single word against the mock text body."""
     wc = WordCounter(MockText)
     self.assertEqual(2, wc.count_word("white"))
Exemplo n.º 21
0
from word_counter import WordCounter
from ui import ui

if __name__ == "__main__":
    # Drive the interactive UI: prompt for a file, count it, show results.
    interface = ui()
    interface.handle_input()
    counter = WordCounter(interface.file_name)
    if counter.run():
        interface.display_word_count(counter)
        interface.display_exit_message()
    else:
        interface.display_error_message()
        exit()
#!/usr/bin/env python
#
# example of a test generator
#

from nose.tools import assert_equal
from word_counter import TextBody, WordCounter

# Load the fixture once at import time; the context manager closes the
# file handle promptly instead of leaking it until garbage collection.
with open('mobydick_summary.txt') as _summary_file:
    MOBYDICK_SUMMARY = _summary_file.read()
TEXT = TextBody(MOBYDICK_SUMMARY)
COUNTER = WordCounter(TEXT)

# (word, expected occurrence count) pairs used by the generated tests.
WORD_PAIRS = [('months', 1), ('whale', 5), ('captain', 4), ('white', 2),
              ('harpoon', 1), ('Ahab', 1)]


def check_word(word, number):
    """Assert that *word* occurs *number* times in the module-level COUNTER."""
    assert_equal(COUNTER.count_word(word), number)


def test_word_pairs():
    # nose test generator: yields one (check, word, number) tuple per
    # example word, producing one test case each.
    # --- no docstring so that parameters are visible ---
    for pair in WORD_PAIRS:
        yield (check_word,) + pair


# nose does
for x, y, z in test_word_pairs:
    if x(y, z):
Exemplo n.º 23
0
Arquivo: main.py Projeto: kail/csep517
def main():
    """Train HMM POS taggers on tokenized tweets and report tagging accuracy.

    Pipeline: count word frequencies on the training split, replace low
    frequency words with classes, train bigram and trigram HMMs, then run
    Viterbi decoding over the selected evaluation split.

    NOTE(review): the bigram evaluation lines are commented out, so
    num_correct_bigram stays 0 and the bigram accuracy printed at the end
    is always zero. Dataset paths are hard-coded to a local checkout.
    """
    logging.info("Starting...")

    # Hard-coded dataset locations (train / dev / test splits).
    training_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.train.json')
    dev_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.dev.json')
    test_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.test.json')

    # First, count the words!
    counter = WordCounter()
    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            # Skip the first and last tokens -- presumably START/STOP
            # markers (matches the "- 2" adjustment of total_words below).
            for i in range(1, len(parsed_sentence) - 1):
                counter.add_word(parsed_sentence[i][0])

    # Finalize counter and separate high frequency from low frequency
    counter.finalize()

    # Initialize the models
    bigram = BigramHMM()
    trigram = TrigramHMM()

    # Second pass over the training data: feed sentences to both models.
    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            # Convert the low frequency words to classes
            counter.classify_sentence(parsed_sentence)

            bigram.add_sentence(parsed_sentence)
            trigram.add_sentence(parsed_sentence)

    # Models have been initialized at this point, finalize the distributions
    #bigram.finalize()
    trigram.finalize()

    # PICK THE PARSER HERE (swap dev_parser for test_parser/training_parser)
    parser = dev_parser

    # Iterate over data and try to predict
    num_correct_bigram = 0
    num_correct_trigram = 0
    total_words = 0
    for parsed_sentence in parser.get_tokenized_sentences():
        if parsed_sentence:
            # Keep an untouched copy; classify_sentence mutates in place.
            original_sentence = copy.deepcopy(parsed_sentence)

            # Convert the low frequency words to classes
            counter.classify_sentence(parsed_sentence)

            # Bigram lattice
            #lattice = Lattice(bigram, parsed_sentence)

            # Trigram lattice
            tri_lattice = TrigramLattice(trigram, parsed_sentence)

            # Calculate best POS using viterbi
            #pos_list_bigram = lattice.get_pos()
            pos_list_trigram = tri_lattice.get_pos()

            # Determine how many were correct
            #num_correct_bigram += get_num_correct(parsed_sentence, pos_list_bigram, lattice)
            num_correct_trigram += get_num_correct(parsed_sentence,
                                                   pos_list_trigram,
                                                   tri_lattice,
                                                   original_sentence, counter)

            # Remove the START and STOP chars
            total_words += (len(parsed_sentence) - 2)

            # Running accuracy after each sentence.
            print("Accuracy: %s" % (num_correct_trigram / total_words))
        else:
            print('ERROR! Couldnt parse sentence')

    print("Bigram HMM Accuracy: %s/%s - %s" %
          (num_correct_bigram, total_words,
           (num_correct_bigram / total_words)))
    print("Trigram HMM Accuracy: %s/%s - %s" %
          (num_correct_trigram, total_words,
           (num_correct_trigram / total_words)))
Exemplo n.º 24
0
 # Interactive menu (loop interior): list every word with its count, or
 # show only the most-used words. Truncated fragment -- the enclosing
 # loop header and the tail of the elif branch are outside this view.
 print("\nPuede escoger una de las siguientes opciones:")
 print("(1) Relación de las palabras utilizadas en el texto, así como las veces que aparecen.")
 print(f"(2) Las {words_most_used} palabras más utilizadas en el texto, asi como las veces que aparecen.")
 try:
     # Non-numeric input raises ValueError and triggers the retry prompt.
     option = int(input("Escribe el número de una de la opción seleccionada: "))
 except ValueError:
     print(f"\nLo sentimos, la opción seleccionada no es válida.")
     another_try = input("¿Gusta volver a seleccionar una opción?  S=si / N=no: ")
     another_try = another_try.lower()
     if another_try == "s":
         # Restart the enclosing prompt loop.
         continue
     else:
         print("\nMuchas gracias por haber usado esta aplicación, esperamos que vuelva pronto.")
         exit()
 if option == 1:
     # Option 1: print every word with its occurrence count.
     word_counter = WordCounter(words_in_text)
     counted_words = word_counter.count_words()
     print(f"\nLa relación de palabras que aprecen {text.file_name}, con la cantidad de veces que aparece cada una de ellas, es la siguiente:")
     for key, value in counted_words.items():
         # Singular vs. plural phrasing ("vez" / "veces").
         if value == 1:
             print(f"{key} aparece {value} vez.")
         else:
             print(f"{key} aparece {value} veces.")
     is_done = True
 elif option == 2:
     # Option 2: sort counts (ignoring stop words) and show the top ones.
     word_counter_sorter = CountedWordsSorter(words_in_text, words_to_ignore)
     counted_words = word_counter_sorter.count_words()
     sorted_counted_words = word_counter_sorter.sort_counted_words(counted_words)
     counter = 0
     if words_most_used == 1:
         print(f"\nLa palabra más utilizada en {text.file_name} es la siguiente:")
Exemplo n.º 25
0
def test_count_word_complex():
    """Count word in a longer text"""
    counter = WordCounter(TextBody(MOBYDICK_SUMMARY))
    assert_equal(counter.count_word("white"), 2)
Exemplo n.º 26
0
 def read(self, filePlace, type):
     """Load *filePlace* as a Spark text RDD and return the counter's max values.

     A YearsCounter is used when *type* equals 'Years'; otherwise a plain
     WordCounter is used. (*type* shadows the builtin, but renaming it
     would break keyword callers.)
     """
     movies = self._sc.textFile(filePlace)
     # Bug fix: `is` compares object identity and only happens to work for
     # interned strings; string values must be compared with `==`.
     counter = YearsCounter() if type == 'Years' else WordCounter()
     return counter.getMaxValues(movies)