def exercise_3(seed='probability', depth=5):
    """ Markov chain text generator (words). See: exercises/lab01.pdf -> Exercise 5 """
    output_length = 10**4
    corpus = get_file_content('files/lab01/norm_wiki_sample.txt').split()

    # For every n-gram of `depth` consecutive words, count how often each word follows it.
    ngrams = defaultdict(lambda: defaultdict(int))
    for word_idx in range(depth, len(corpus)):
        previous_ngram = ' '.join(corpus[word_idx - depth:word_idx])
        current_word = corpus[word_idx]
        ngrams[previous_ngram][current_word] += 1

    output_text = seed.split() if seed else []
    for _ in range(output_length):
        current_ngram = ' '.join(output_text[-depth:])
        if current_ngram in ngrams:
            # Known context: draw the next word proportionally to its observed frequency.
            freq_of_next_words = ngrams[current_ngram]
            words, probabilities = zip(*freq_of_next_words.items())
            probabilities = np.array(probabilities, dtype=float)
            probabilities /= probabilities.sum()
            output_text.append(choice(a=words, p=probabilities))
        else:
            # Unseen context: fall back to a uniformly random word from the corpus.
            output_text.append(corpus[randint(len(corpus))])
    return ' '.join(output_text)

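
# Illustrative sketch (not part of the original exercise): the frequency-to-probability
# conversion used above, isolated for a single context. The helper name
# `sample_next_word` is an assumption introduced here for clarity.
def sample_next_word(freq_of_next_words):
    # freq_of_next_words: dict mapping each candidate next word to its observed count.
    words, counts = zip(*freq_of_next_words.items())
    probabilities = np.array(counts, dtype=float)
    probabilities /= probabilities.sum()
    return choice(a=words, p=probabilities)
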
def exercise_1():
    """ See: exercises/lab02.pdf -> Exercise 1 """
    words = Counter(
        get_file_content('files/lab01/norm_wiki_sample.txt').split())
    counter_total = sum(words.values())
    first_6000_values = sum_n_counter_values(words, 6000) / counter_total
    first_30000_values = sum_n_counter_values(words, 30000) / counter_total
    return [first_6000_values, first_30000_values]

def exercise_1():
    """ See: exercises/lab03.pdf -> Exercise 1 """
    # Entropy of a uniform distribution over the alphabet (letters and digits).
    length_of_the_alphabet = len(english_alphabet_generator(numbers=True))
    probabilities = [1 / length_of_the_alphabet] * length_of_the_alphabet
    pure_alphabet_entropy = calculate_entropy(probabilities)

    # Entropy of the empirical character distribution of the wiki sample.
    wiki_file_counter = Counter(
        get_file_content('files/lab01/norm_wiki_sample.txt'))
    _, values = zip(*wiki_file_counter.most_common(None))
    wiki_probabilities = conver_array_to_probabilities(values)
    wiki_alphabet_entropy = calculate_entropy(wiki_probabilities)

    return (pure_alphabet_entropy, wiki_alphabet_entropy)

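
# Minimal sketch (assumption): calculate_entropy is used above as Shannon entropy of a
# probability vector, H = -sum(p * log2(p)). The original helper is defined elsewhere in
# the repository; a reference implementation might look like this.
def shannon_entropy(probabilities):
    probabilities = np.asarray(probabilities, dtype=float)
    probabilities = probabilities[probabilities > 0]  # zero-probability symbols contribute nothing
    return float(-np.sum(probabilities * np.log2(probabilities)))
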
def calculate_conditional_entropy_on_file(file, depth, words=False):
    depth += 1
    # Read the file content and transform it into a list of chars/words.
    file_content = (list(get_file_content(file)) if not words
                    else get_file_content(file).split())

    # Prepare counters for every observed ngram and its prefix (the conditioning context).
    ngram_counter = defaultdict(int)
    base_counter = defaultdict(int)

    # Count ngrams and their prefixes.
    for item_idx in range(depth, len(file_content)):
        if words:
            ngram_counter[' '.join(file_content[item_idx - depth:item_idx])] += 1
            base_counter[' '.join(file_content[item_idx - depth:item_idx - 1])] += 1
        else:
            ngram_counter[''.join(file_content[item_idx - depth:item_idx])] += 1
            base_counter[''.join(file_content[item_idx - depth:item_idx - 1])] += 1

    num_ngrams = len(file_content) - depth

    # Conditional entropy: H = -sum over ngrams of p(ngram) * log2 p(last item | prefix).
    entropy = 0
    for ngram in ngram_counter:
        prob_of_ngram = ngram_counter[ngram] / num_ngrams
        if words:
            cond_prob = ngram_counter[ngram] / base_counter[' '.join(ngram.split()[:-1])]
        else:
            cond_prob = ngram_counter[ngram] / base_counter[ngram[:-1]]
        entropy -= prob_of_ngram * np.log2(cond_prob)
    return entropy

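
# Illustrative sketch (not part of the original lab code): comparing conditional entropy
# across context lengths shows how much each extra symbol of context reduces uncertainty.
# The helper name `conditional_entropy_by_depth` and the default file path (reused from
# the exercises above) are assumptions.
def conditional_entropy_by_depth(file='files/lab01/norm_wiki_sample.txt',
                                 max_depth=3, words=False):
    # Returns {depth: conditional entropy of an order-`depth` model} for depths 1..max_depth.
    return {
        depth: calculate_conditional_entropy_on_file(file, depth, words=words)
        for depth in range(1, max_depth + 1)
    }
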
def exercise_2():
    """ See: exercises/lab02.pdf -> Exercise 2 """
    words = Counter(
        get_file_content('files/lab01/norm_wiki_sample.txt').split())
    values, probabilities = zip(*words.items())
    probabilities = np.array(probabilities, dtype=float)
    probabilities /= probabilities.sum()

    # Sample words independently of context, proportionally to their corpus frequency.
    output_text_length = 10**3
    output_text = [
        choice(a=values, p=probabilities) for _ in range(output_text_length)
    ]
    return ' '.join(output_text)