def sampling(text_file):
    '''Build a histogram from text_file and print one generated sentence.

    The histogram is computed once and shared by get_count() and sentence().
    sentence() is called a single time, so the value printed is the only one
    generated (a previous extra call had its joined result discarded).

    text_file: passed unchanged to histogram_dict(); len(text_file) is
        forwarded to sentence() as its ``total`` argument.
    Returns None; the value returned by sentence() is printed as-is.
    '''
    histogram = histogram_dict(text_file)
    count = get_count(histogram)
    total = len(text_file)
    print(sentence(count, total, histogram))
def test(words_list, trials=10000):
    """Print sampled vs. source relative frequency for every sampled word.

    Calls sample(words_list) ``trials`` times, builds a histogram of the
    draws, and prints one line per distinct drawn word containing:
    the word, the fraction of draws that produced it, and its count in the
    histogram of words_list divided by len(words_list), rounded to 4 places.

    words_list: the source words; must support len().
    trials: number of draws; also the divisor for the observed fraction, so
        the two can no longer drift apart. Defaults to 10000.
    Returns None.
    """
    expected = histogram.histogram_dict(words_list)
    # Local list deliberately not named ``test`` so it does not shadow this
    # function.
    drawn = [sample(words_list) for _ in range(trials)]
    observed = histogram.histogram_dict(drawn)
    for word in observed:
        print(word, 'test', observed[word]/trials, 'distribution',
              round((expected[word]/len(words_list)), 4))
def sample(input):
    """Return one key of histogram_dict(input), weighted by its count.

    input: passed unchanged to histogram_dict(), whose result is used as a
        mapping of key -> count.
    Returns the chosen key, or None when the counts sum to zero or less
    (nothing to draw from).
    """
    counts = histogram_dict(input)
    total = sum(counts.values())
    if total <= 0:
        # randint(1, 0) would raise ValueError; keep the "no result" outcome.
        return None
    # randint() includes both endpoints. Drawing from 1..total gives every
    # occurrence exactly one slot; starting at 0 would hand the first key an
    # extra slot and bias the result toward it.
    remaining = randint(1, total)
    for word, count in counts.items():
        remaining -= count
        if remaining <= 0:
            return word
def main_sample(words_file):
    """Return the word random_word() picks from the histogram of words_file.

    words_file is passed unchanged to histogram_dict(); the sum of the
    resulting histogram's values is handed to random_word() as the count.
    """
    counts = histogram_dict(words_file)
    total_count = sum(counts.values())
    return random_word(counts, total_count)
def test_sample_freq(histogram):
    """Draw 2000 samples with sample_freq() and return their histogram.

    histogram is forwarded unchanged to every sample_freq() call; the list
    of draws is summarized with histogram_dict().
    """
    draws = [sample_freq(histogram) for _ in range(2000)]
    return histogram_dict(draws)
def main_sample(text_file):
    '''Print and return a word drawn by prob_word() from text_file's histogram.

    text_file: passed unchanged to histogram_dict().
    Returns the word returned by prob_word(). (The result of print() is
    always None, so it must not be used as the return value.)
    '''
    histogram = histogram_dict(text_file)
    count = get_count(histogram)
    word = prob_word(histogram, count)
    print(word)
    return word
def dictionary_of_list_counts(word_dict):
    """Map every key of word_dict to histogram.histogram_dict() of its value.

    word_dict: a mapping; each value is passed unchanged to
        histogram.histogram_dict().
    Returns a new dict with the same keys.
    """
    return {
        key: histogram.histogram_dict(values)
        for key, values in word_dict.items()
    }
def get_word_distribution(words_list):
    """Assign each word a run of consecutive integers as long as its count.

    The histogram of words_list is walked in iteration order. The first word
    gets [1 .. count], the next word continues where the previous one
    stopped, and so on.

    Returns a dict mapping word -> list of the integers assigned to it.
    """
    counts = histogram.histogram_dict(words_list)
    distribution = {}
    next_start = 1
    for word, count in counts.items():
        stop = next_start + count
        distribution[word] = list(range(next_start, stop))
        next_start = stop
    return distribution
def create_dict_from_list(clean_list):
    '''Count adjacent pairs of a clean list.

    Every (item, following item) tuple from clean_list is collected in order
    and the list of tuples is summarized with h.histogram_dict(), whose
    result is returned.'''
    adjacent_pairs = list(zip(clean_list, clean_list[1:]))
    return h.histogram_dict(adjacent_pairs)
def create_dictionary_from_list(clean_list):
    """Take a list and return a dictionary built from its adjacent pairs.

    clean_list: an indexable, sliceable sequence (e.g. a list of words).
    Returns the result of h.histogram_dict() applied to the list of
    (item, following item) tuples, in order of appearance. A sequence with
    fewer than two items yields an empty list of pairs.
    """
    # Names chosen so the builtin ``next`` is not shadowed.
    list_of_pairs = [
        (current_word, next_word)
        for current_word, next_word in zip(clean_list, clean_list[1:])
    ]
    return h.histogram_dict(list_of_pairs)
def build_text_histogram(self, words_list):
    """Return histogram.histogram_dict() of this instance's words_list.

    NOTE(review): the ``words_list`` argument is not used; the histogram is
    built from ``self.words_list``. Confirm which one is intended.
    """
    build = histogram.histogram_dict
    return build(self.words_list)
def sample_freq(histogram): # Length of the histogram based off values hist_length = sum(histogram.values()) word_predict = randint(1, hist_length) counter = 0 for word in histogram.keys(): # Adds the histogram value to the counter counter += histogram[word] if word_predict <= counter: return word def test_sample_freq(histogram): word_freq = [] for _ in range(2000): sample = sample_freq(histogram) word_freq.append(sample) return histogram_dict(word_freq) if __name__ == "__main__": source = 'small_sample.txt' # sample = sample_freq(histogram_dict(read_file(source))) # print(sample) test = test_sample_freq(histogram_dict(read_file(source))) print(test)
import random
from histogram import histogram_dict


def random_word(histogram, count):
    '''Return a key of histogram, weighted by its value.

    histogram: mapping of word -> count.
    count: upper bound of the draw; pass the sum of the histogram's values
        so every occurrence gets exactly one slot.
    Returns the first key whose running total reaches the drawn index, or
    None if the index is larger than the sum of all values.
    '''
    total = 0
    index = random.randint(1, count)
    for key, value in histogram.items():
        total += value
        if index <= total:
            return key


def main_sample(words_file):
    '''Return a word drawn by random_word() from the histogram of words_file.'''
    words = histogram_dict(words_file)
    count = sum(words.values())
    word = random_word(words, count)
    return word


if __name__ == '__main__':
    words_file = 'test.txt'
    # Go through main_sample() so the draw is bounded by the histogram's
    # real total instead of a hard-coded 100, which skews the weights and
    # can yield None whenever the total differs from 100.
    print(main_sample(words_file))
def main_sample(source_text):
    """Return the word prob_word() picks from the histogram of source_text.

    source_text is passed unchanged to histogram_dict(); the sum of the
    histogram's values is given to prob_word() as the count.
    """
    counts = histogram_dict(source_text)
    return prob_word(counts, sum(counts.values()))
def ensure_randomness(histogram):
    """Call random_word(histogram) 10000 times and return the draws' histogram."""
    draws = []
    for _ in range(10000):
        draws.append(random_word(histogram))
    return histogram_dict(draws)
def ensure_probability(histogram):
    """Sample 10000 words from probabilities(histogram) and histogram them.

    probabilities() is evaluated once; its result is passed to every
    probabilistic_random_word() call. Returns histogram_dict() of the draws.
    """
    table = probabilities(histogram)
    samples = []
    for _ in range(10000):
        samples.append(probabilistic_random_word(table))
    return histogram_dict(samples)
def ensure_probability(histogram):
    """Return the histogram of 10000 probabilistic_random_word() draws.

    The argument for every draw is the single result of
    probabilities(histogram), computed up front.
    """
    word_probs = probabilities(histogram)
    draws = [probabilistic_random_word(word_probs) for _ in range(10000)]
    return histogram_dict(draws)
return word def probabilities(freq_dist): probabilities = [] prob = 0 for word, freq in freq_dist.items(): prob = round(prob + probability(word, freq_dist), 4) probabilities.append((prob, word)) return sorted(probabilities) def probability(word, freq_dist): numWords = 0 for _, freq in freq_dist.items(): numWords += freq return round(freq_dist[word] / numWords, 4) def ensure_randomness(histogram): return histogram_dict([random_word(histogram) for i in range(10000)]) def ensure_probability(histogram): probs = probabilities(histogram) return histogram_dict([probabilistic_random_word(probs) for i in range(10000)]) if __name__ == '__main__': import sys words = open(sys.argv[1], 'r').read().split() hstgm = histogram_dict(words) probs = probabilities(hstgm) print(probabilistic_random_word(probs)) print(ensure_probability(hstgm))
import random
from histogram import histogram_dict


def prob_word(histogram, count):
    '''Return a key of histogram, weighted by its value.

    histogram: mapping of word -> count.
    count: upper bound of the draw; pass the sum of the histogram's values
        so every occurrence gets exactly one slot.
    Returns the first key whose running total reaches the drawn index, or
    None if the index is larger than the sum of all values.
    '''
    total = 0
    index = random.randint(1, count)
    for key, value in histogram.items():
        total += value
        if index <= total:
            return key


def main_sample(source_text):
    '''Return a word drawn by prob_word() from the histogram of source_text.'''
    words = histogram_dict(source_text)
    count = sum(words.values())
    word = prob_word(words, count)
    return word


if __name__ == '__main__':
    source_text = 'book.txt'
    # Go through main_sample() so the draw is bounded by the histogram's
    # real total instead of a hard-coded 21, which skews the weights and
    # can yield None whenever the total differs from 21.
    print(main_sample(source_text))
for word, freq in freq_dist.items(): prob = round(prob + probability(word, freq_dist), 4) probabilities.append((prob, word)) return sorted(probabilities) def probability(word, freq_dist): numWords = 0 for _, freq in freq_dist.items(): numWords += freq return round(freq_dist[word] / numWords, 4) def ensure_randomness(histogram): return histogram_dict([random_word(histogram) for i in range(10000)]) def ensure_probability(histogram): probs = probabilities(histogram) return histogram_dict( [probabilistic_random_word(probs) for i in range(10000)]) if __name__ == '__main__': import sys words = open(sys.argv[1], 'r').read().split() hstgm = histogram_dict(words) probs = probabilities(hstgm) print(probabilistic_random_word(probs)) print(ensure_probability(hstgm))