def main(rootDirectory, words, confidenceLevel, functionType):
    corpus = Corpus(rootDirectory, toLowercase=TO_LOWERCASE, filters=FILTERS)
    #sampler = Sampler(SAMPLE_SIZE, sampleLength=SAMPLE_LENGTH)
    sampler = Sampler(SAMPLE_SIZE,
                      sampleLengthPercentage=SAMPLE_LENGTH_PERCENTAGE)

    documentSamples = {}
    for documentTitle in corpus.documents:
        documentSample = sampler.sample(corpus.documents[documentTitle],
                                        usePercentage=True)
        documentSamples[documentTitle] = documentSample

    wordCounter = WordCounter(words, SAMPLE_SIZE)
    wordCounter.countOccurrences(documentSamples)

    dataLabels = sorted(list(wordCounter.occurrences.keys()))
    dataSets = []
    for dataLabel in dataLabels:
        #dataSet = wordCounter.occurrences[dataLabel]
        dataSet = wordCounter.occurrencesPerMillionWords[dataLabel]
        dataSets.append(dataSet)

    statisticsPlotter = StatisticsPlotter(dataLabels, dataSets, CONFIDENCE,
                                          words)
    statisticsPlotter.plotStatistics(functionType=functionType)
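The example above relies on module-level constants (TO_LOWERCASE, FILTERS, SAMPLE_SIZE, SAMPLE_LENGTH_PERCENTAGE, CONFIDENCE) defined elsewhere in its project. A minimal sketch of what that configuration block might look like; the values are illustrative assumptions, not the project's actual settings:

# Illustrative placeholders only -- the real project defines its own values.
TO_LOWERCASE = True                 # normalize document text to lowercase (assumed)
FILTERS = ['.', ',', ';', '\n']     # characters stripped by Corpus (assumed)
SAMPLE_SIZE = 100                   # number of samples drawn per document (assumed)
SAMPLE_LENGTH_PERCENTAGE = 1        # each sample spans 1% of a document (assumed)
CONFIDENCE = 0.95                   # confidence level for the plotted intervals (assumed)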
Example #2
    def test_counts_a_real_java_sourceline(self):
        sample = """
	private void addOrderEntryLibJars(Tag set) {
		for (String jarName : libJars())
			if (isJunit(jarName))
				addLibraryOrderEntry(set, "junit");
			else
				addLibraryOrderEntry(set, jarName.split(".jar")[0]);
	}"""
        counter = WordCounter()
        counter.count(sample)
        expected = [('jar', 4), ('set', 3), ('name', 3), ('add', 3)]
        self.assertEqual(expected, counter.mostUsedWords(4))
Example #3
    def test_counts_a_real_java_sourceline(self):
        sample = """
	private void addOrderEntryLibJars(Tag set) {
		for (String jarName : libJars())
			if (isJunit(jarName))
				addLibraryOrderEntry(set, "junit");
			else
				addLibraryOrderEntry(set, jarName.split(".jar")[0]);
	}"""
        counter = WordCounter()
        counter.count(sample)
        expected = [
            ('jar', 4),
            ('set', 3),
            ('name', 3),
            ('add', 3),
        ]
        self.assertEqual(expected, counter.mostUsedWords(4))
Example #4
def main():
    rootdir = 'C:/Users/pouya/python-projects/truth-goggles/congressionalrecord/output'
    wordStems = {}  # dictionary of all the stemmed words said and the number of times they are said

    # creates dictionary of legislators to cross-reference for party and chamber
    legislators = PoliticiansDatabase()

    # if we want to re-parse/clean the congressional record, this builds the dictionary of speakers and speeches
    # builds dictionary where key is a speaker and value is an array of all of their speeches
    speakerDict = buildWriterFile(rootdir)

    numberOfSpeechesDems = 0  # stores number of speeches given by Dems
    numberOfSpeechesGop = 0  # stores number of speeches given by GOP

    words = WordCounter()

    for speaker in speakerDict:  # iterates through the speakers of all speeches

        speakerArr = speaker.split(";")  # [name of speaker (+ " of " + their state), chamber]
        speakerName = speakerArr[0]  # stores speaker's name
        if speaker not in wordStems:
            wordStems[speaker] = []  # adds speaker to wordStems if not already in it
        # stores the speaker's name without prefix, plus their chamber
        speakerDictKey = speaker.split(".")[-1].lower()[1:]

        numberOfSpeechesBoolean = True  # helper boolean to keep track of number of speeches we parse
        for tup in speakerDict[speaker]:
            speech = tup[1]  # stores speech given by speaker
            year = tup[0]  # stores year that speech was given

            # finds the speaker's party; assigns "not found" if the speaker cannot be found
            speakerParty = legislators.getSpeakerParty(year, speakerDictKey)

            # adds the number of speeches given by the speaker to the appropriate party's speech count
            if numberOfSpeechesBoolean:
                if speakerParty == "democrat":
                    numberOfSpeechesDems += len(speakerDict[speaker])
                elif speakerParty == "republican":
                    numberOfSpeechesGop += len(speakerDict[speaker])
            numberOfSpeechesBoolean = False

            words.addSingleWords(speech, speakerParty)

    words.calculateFrequencies(minimumOccs=125)
    words.printTopFrequencies(howMany=1000, whichParty='gop')
    words.printTopFrequencies(howMany=1000, whichParty='dem')
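Judging from the comments and the split(";") / split(".") calls above, each speakerDict entry pairs a "name;chamber" key with a list of (year, speech) tuples. A purely illustrative sketch of that shape; the names, years, and text are made up:

# Hypothetical data shaped the way the loop above expects it.
speakerDict = {
    "Mr. SMITH of Ohio;Senate": [
        (2019, "Mr. President, I rise today to discuss ..."),
        (2020, "I yield the remainder of my time ..."),
    ],
}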
Example #5
def main(indir='wc_input',
         outdir='wc_output',
         word_count_report='wc_result.txt',
         running_median_report='med_result.txt',
         running_median_method='hashtracker'):

    # create output directory
    try:
        os.mkdir(outdir)
    except Exception:
        pass

    # initialize data structures
    wc = WordCounter(report_filename=os.path.join(outdir, word_count_report))
    rm = RunningMedian.factory(running_median_method, report_filename=os.path.join(outdir, running_median_report))

    for file in os.listdir(indir):
        if not file.endswith('.txt'):
            continue

        with open(os.path.join(indir, file), 'r') as f:
            for line in f:
                words = line.split()

                # count each word
                for w in words:
                    wc.update(w)

                # count number of words
                rm.update(len(words))
                rm.report()

    # report word count after everything is done
    wc.report()
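RunningMedian.factory is not shown in this example. A common way to maintain a running median of the per-line word counts is the two-heap technique; the sketch below is a self-contained illustration of that idea, not the example's actual 'hashtracker' implementation:

import heapq

class TwoHeapRunningMedian:
    """Illustrative tracker: a max-heap holds the lower half, a min-heap the upper half."""

    def __init__(self):
        self._low = []   # max-heap, stored as negated values
        self._high = []  # min-heap

    def update(self, value):
        if self._low and value > -self._low[0]:
            heapq.heappush(self._high, value)
        else:
            heapq.heappush(self._low, -value)
        # Rebalance so the two halves never differ by more than one element.
        if len(self._low) > len(self._high) + 1:
            heapq.heappush(self._high, -heapq.heappop(self._low))
        elif len(self._high) > len(self._low) + 1:
            heapq.heappush(self._low, -heapq.heappop(self._high))

    def median(self):
        if len(self._low) > len(self._high):
            return float(-self._low[0])
        if len(self._high) > len(self._low):
            return float(self._high[0])
        return (-self._low[0] + self._high[0]) / 2.0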
Example #6
    def test_acronym_in_middle(self):
        counter = WordCounter()
        result = counter._uncamelCase('embedHTMLHere')
        self.assertEqual(['embed', 'HTML', 'Here'], list(result))
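This test and the similar _uncamelCase tests below all exercise the same splitting behaviour: acronyms stay intact and the original casing is preserved. The method's real implementation is not shown on this page; a minimal regex-based sketch that satisfies these cases (the standalone function name is illustrative):

import re

# Matches, in order of preference: a run of capitals not followed by a lowercase
# letter (an acronym), a capitalized word, or a lowercase word.
_CAMEL_PATTERN = re.compile(r'[A-Z]+(?![a-z])|[A-Z][a-z]*|[a-z]+')

def uncamel_case(identifier):
    # 'embedHTMLHere' -> ['embed', 'HTML', 'Here']
    return _CAMEL_PATTERN.findall(identifier)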
Example #7
    def test_joined_leadingUpper(self):
        counter = WordCounter()
        result = counter._uncamelCase('LeadingUpper')
        self.assertEqual(['Leading', 'Upper'], list(result))
Example #8
    def test_capitalized(self):
        counter = WordCounter()
        result = counter._uncamelCase('This')
        self.assertEqual(['This'], list(result))
Example #9
    def test_failFast(self):
        sample = "public Album"
        counter = WordCounter()
        counter.count(sample)
        self.assertEqual(1, counter.timesOccurred('public'))
Example #10
    def test_differentCase(self):
        counter = WordCounter()
        counter.count("Album album ALBUM")
        self.assertEqual(3, counter.timesOccurred('album'))
Example #11
    def test_leading_upper(self):
        counter = WordCounter()
        result = counter._uncamelCase('THIS')
        self.assertEqual(['THIS'], list(result))
Example #12
    def test_all_lower(self):
        counter = WordCounter()
        result = counter._uncamelCase('this')
        self.assertEqual(['this'], list(result))
Example #13
    def test_counts_words_despite_punctuation(self):
        counter = WordCounter()
        counter.count("Album album = new ALBUM()")
        self.assertEqual(3, counter.timesOccurred('album'))
Example #14
    def test_differentCase(self):
        counter = WordCounter()
        counter.count("Album album ALBUM")
        self.assertEqual(3, counter.timesOccurred('album'))
Example #15
from HelperMethods import *
from WordCounter import WordCounter

import random

# Create WordCounters with nonsense words and a random word count
wc = WordCounter(makeNonsenseWord(), random.randint(1, 45))
print(wc)

wc = WordCounter(makeNonsenseWord(), random.randint(1, 45))
print(wc)

wc = WordCounter(makeNonsenseWord(), random.randint(1, 45))
print(wc)

wc = WordCounter(makeNonsenseWord(), random.randint(1, 45))
print(wc)

wc = WordCounter(makeNonsenseWord(), random.randint(1, 45))
print(wc)
Example #16
    def test_capitalized(self):
        counter = WordCounter()
        result = counter._uncamelCase('This')
        self.assertEqual(['This'], list(result))
Example #17
import argparse
from InputFileValidator import InputFileValidator
from WordCounter import WordCounter

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='This is a Python version of the grep function.')
    parser.add_argument('name_of_file',
                        type=str,
                        help='the name of the file to check')
    args = parser.parse_args()
    name_of_file = args.name_of_file
    word_searcher = InputFileValidator(name_of_file)
    if word_searcher.validate():
        word_counter = WordCounter(word_searcher.get_list_of_files())
        word_counter.returning_count_of_all_chars_words_lines_provided()
Example #18
    def test_failFast(self):
        sample = "public Album"
        counter = WordCounter()
        counter.count(sample)
        self.assertEqual(1, counter.timesOccurred('public'))
Example #19
    def test_counts_words_despite_punctuation(self):
        counter = WordCounter()
        counter.count("Album album = new ALBUM()")
        self.assertEqual(3, counter.timesOccurred('album'))
Example #20
    def test_joined_leadingLower(self):
        counter = WordCounter()
        result = counter._uncamelCase('thisTime')
        self.assertEqual(['this', 'Time'], list(result))
Example #21
    def test_all_lower(self):
        counter = WordCounter()
        result = counter._uncamelCase('this')
        self.assertEqual(['this'], list(result))
Example #22
    def test_joined_leadingUpper(self):
        counter = WordCounter()
        result = counter._uncamelCase('LeadingUpper')
        self.assertEqual(['Leading', 'Upper'], list(result))
Example #23
    def test_leading_upper(self):
        counter = WordCounter()
        result = counter._uncamelCase('THIS')
        self.assertEqual(['THIS'], list(result))
Example #24
    def test_end_with_acronym(self):
        counter = WordCounter()
        result = counter._uncamelCase('endWithHTML')
        self.assertEqual(['end', 'With', 'HTML'], list(result))
Example #25
    def test_joined_leadingLower(self):
        counter = WordCounter()
        result = counter._uncamelCase('thisTime')
        self.assertEqual(['this', 'Time'], list(result))
Example #26
    def test_acronym_in_middle(self):
        counter = WordCounter()
        result = counter._uncamelCase('embedHTMLHere')
        self.assertEqual(['embed', 'HTML', 'Here'], list(result))
Example #27
    def test_end_with_acronym(self):
        counter = WordCounter()
        result = counter._uncamelCase('endWithHTML')
        self.assertEqual(['end', 'With', 'HTML'], list(result))
Example #28
    def test_word_count(self):
        word_list = ['test', 'this', 'this', 'that', 'is', 'test', 'is', 'test', 'is', 'of', 'yes']
        word_counter = WordCounter(words=word_list)
        self.assertEqual(word_counter.wordcount,
                         [('test', 3), ('is', 3), ('this', 2), ('that', 1), ('of', 1), ('yes', 1)])
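The expected list above is exactly what collections.Counter.most_common() produces for that input (descending counts, with ties kept in first-seen order), so one plausible reading of the tested class is the minimal sketch below; the real WordCounter may well do more:

from collections import Counter

class WordCounter:
    """Minimal sketch consistent with the test above; an assumption, not the tested code."""

    def __init__(self, words):
        # (word, count) pairs sorted by descending count; ties keep first-seen order.
        self.wordcount = Counter(words).most_common()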
Example #29
import os
from WordCounter import WordCounter, FileWordExtractor

FILEPATH = os.path.join(os.path.dirname(__file__), 'input.txt')

try:
    if os.path.exists(FILEPATH):
        text_file = FileWordExtractor(FILEPATH)
        words = text_file.words
        word_counter = WordCounter(words=words)  # initialize the word counter
        word_count_dict = word_counter.wordcount  # get the word counts
        text_file.display_total_words()
        word_counter.display_word_counts()
    else:
        print('File not found')
except Exception:
    print('Application Error')
Example #30
Results
-------
    - All output is written to the output file (output.txt).

Notes
-----
    - The program treats the input file as case-insensitive.
    - All special characters from the input file are filtered out (a normalization sketch follows this example).
    - This program calls functions from WordCounter.py.
"""

import sys
import os
from WordCounter import WordCounter

# validate number of input arguments
if len(sys.argv) < 2:
    print("filename needed as command argument")
    sys.exit(1)

filename = sys.argv[1]

# validate input file
if not os.path.isfile(filename):
    print(f"Error: {filename} is not a file.")
    print("Exit Program")
    sys.exit(1)

wc = WordCounter(filename)
wc.generateHistogram()
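The notes in the docstring above say the histogram treats its input as case-insensitive and strips special characters. A small illustrative helper showing one common way such normalization is done; this is an assumption, not the actual WordCounter.py code:

import re

def normalize(text):
    # Lowercase the text and replace anything that is not a letter, digit, or
    # whitespace with a space, mirroring the behaviour described in the notes.
    return re.sub(r'[^a-z0-9\s]', ' ', text.lower())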