def frequent_word_statistics(self):
    try:
        word_collection = []
        word_frequency_collection = []
        page_types = ["index", "login", "register"]
        for page_type in page_types:
            # Read the URL list for this page category.
            with open("categories/" + page_type + "/url.txt") as url_file:
                urls = url_file.read().split("\n")
            for url in urls:
                for word in self.fetch_words(url, 2, 6):
                    word_collection = self.collect(word_collection, word)
                    # Increment the count of a word seen before, otherwise start a new entry.
                    existing = next(
                        (wf for wf in word_frequency_collection if wf.word == word),
                        None)
                    if existing is not None:
                        existing.frequency += 1
                    else:
                        word_frequency_collection.append(WordFrequency(word))
        # Write out words and their counts, most frequent first.
        word_frequency_collection.sort(key=lambda wf: wf.frequency, reverse=True)
        with open("categories/words.txt", 'w', encoding='utf-8') as word_file, \
                open("categories/all/vector.txt", 'w', encoding='utf-8') as vector_file:
            for word_frequency in word_frequency_collection:
                print(word_frequency.word + " " + str(word_frequency.frequency))
                word_file.write(word_frequency.word + "\n")
                vector_file.write(str(word_frequency.frequency) + "\n")
    except FileNotFoundError as e:
        print(e)
        sys.exit(1)
def calc_frequencies(self):
    for word in self.filtered_words:
        frequency = self.get_word_frequency(word)
        if frequency is None:
            frequency = WordFrequency(word, self.words_total)
            self.word_frequencies.append(frequency)
        else:
            frequency.add()
def calc_texts_frequencies(self):
    for text_number in range(self.texts_total):
        words = self.words_matrix[text_number]
        for word in words:
            text_frequency = self.get_word_frequency(word, text_number)
            if text_frequency is None:
                # len(words) is the total number of words in this text,
                # mirroring words_total in calc_frequencies above.
                text_frequency = WordFrequency(word, len(words))
                self.word_frequencies_matrix[text_number].append(text_frequency)
            else:
                text_frequency.add()
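# Hypothetical sketch (not the project's actual class): the minimal WordFrequency
# interface that frequent_word_statistics, calc_frequencies and calc_texts_frequencies
# above appear to rely on -- a word, a mutable count, an add() increment and equality
# by word. The constructor signature and attribute names are assumptions.
class WordFrequencySketch:
    def __init__(self, word, words_total=None):
        self.word = word
        self.frequency = 1              # first occurrence counts as 1
        self.words_total = words_total  # corpus or text size, kept for normalisation

    def add(self):
        self.frequency += 1

    def __eq__(self, other):
        return isinstance(other, WordFrequencySketch) and self.word == other.word

    def __hash__(self):
        return hash(self.word)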
from WordFrequency import WordFrequency
import sys
import json
import operator
from collections import OrderedDict
from collections import Counter

if __name__ == '__main__':
    # Create an instance of our MRJob subclass.
    job = WordFrequency(args=sys.argv[1:])
    with job.make_runner() as runner:
        # Run the job.
        runner.run()
        # print(json.dumps(data))
        # Process the output.
        data = OrderedDict()
        with open("result.txt", "w") as f:
            for line in runner.stream_output():
                key, value = job.parse_output_line(line)
                print('key:', key, 'value:', value)
                f.write(str(key) + " " + str(value) + "\n")
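# The WordFrequency MRJob subclass driven above is not shown in this section.
# A minimal word-count job it could resemble looks like this (illustrative sketch
# only; the real mapper/reducer and tokenisation rules may differ):
from mrjob.job import MRJob
import re

WORD_RE = re.compile(r"[\w']+")

class WordFrequencySketchJob(MRJob):
    def mapper(self, _, line):
        # Emit (word, 1) for every token on the input line.
        for word in WORD_RE.findall(line):
            yield word.lower(), 1

    def reducer(self, word, counts):
        # Sum the per-word counts emitted by the mappers.
        yield word, sum(counts)

if __name__ == '__main__':
    WordFrequencySketchJob.run()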
from WordFrequency import WordFrequency

# Candidate alphabet for replacements and insertions (assumed lowercase ASCII,
# as in Norvig's spelling corrector).
alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    """All strings one edit (delete, transpose, replace, insert) away from word."""
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts = [a + c + b for a, b in splits for c in alphabet]
    return set(deletes + transposes + replaces + inserts)


def datafile(name, sep='\t'):
    """Read key,value pairs from file."""
    for line in open(name):
        yield line.split(sep)


edit_frequencies = WordFrequency.from_freq_file('Norvig/edits/count_1edit.txt')
p_spell_error = 1. / 20.


def p_edit(edit):
    """The probability of an edit; can be '' or 'a|b'."""
    if edit == '':
        return 1. - p_spell_error
    return p_spell_error * edit_frequencies.get_probability(edit)


# Pw (the unigram word model) is expected to be defined or imported elsewhere.
PREFIXES = set(w[:i] for w in Pw for i in range(len(w) + 1))


def edits(word, dictionary, d=2):
from Context import Context
from EditDistance import edits
from WordFrequency import WordFrequency
import re
import Readers

count_1w = WordFrequency.from_freq_file("data/Norvig/wordfreqs/count_1w.txt")
count_2w = WordFrequency.from_freq_file("data/Norvig/wordfreqs/count_2w.txt")


def corrections(text):
    "Spell-correct all words in text."
    return re.sub('[a-zA-Z]+', lambda m: correct(m.group(0)), text)


def correct(context):
    "Return the word that is the most likely spell correction of w."
    candidates = edits(context.word()).items()
    # c, edit = max(candidates, key=lambda (c,e): Pedit(e) * Pw(c))
    # return c


for context in Context.gen_context_sequence_from_word_sequence(
        Readers.gen_words_from_file("doc.txt")):
    correct(context)
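# Hypothetical sketch (an assumption, not the author's finished code): the candidate
# ranking that the commented-out line in correct() above seems to intend, i.e. score
# each candidate c reached via edit e by P(edit) * P(word) and keep the best one.
# The edit-probability and word-probability functions are passed in explicitly.
def pick_best_candidate(candidates, p_edit, p_word):
    """candidates maps candidate word -> edit string ('' means no edit)."""
    best, _ = max(candidates.items(), key=lambda ce: p_edit(ce[1]) * p_word(ce[0]))
    return best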