Example #1
def exercise_3(seed='probability', depth=5):
    """ Markov chain text generator (words).
        See: exercises/lab01.pdf -> Exercise 5 """

    output_length = 10**4
    corpus = get_file_content('files/lab01/norm_wiki_sample.txt').split()

    ngrams = defaultdict(lambda: defaultdict(int))

    for word_idx in range(depth, len(corpus)):
        previous_ngram = ' '.join(corpus[word_idx - depth:word_idx])
        current_word = corpus[word_idx]
        ngrams[previous_ngram][current_word] += 1

    output_text = seed.split() if seed else []

    for _ in range(output_length):

        current_ngram = ' '.join(output_text[-depth:])

        if current_ngram in ngrams:

            freq_of_next_words = ngrams[current_ngram]

            words, counts = zip(*freq_of_next_words.items())
            probabilities = np.array(counts, dtype=float)
            probabilities /= probabilities.sum()

            output_text.append(choice(a=words, p=probabilities))

        else:
            output_text.append(corpus[randint(len(corpus))])

    return ' '.join(output_text)
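These snippets rely on a handful of module-level imports and a small file-reading helper that are not shown. Below is a minimal sketch of the assumed setup; get_file_content and its exact signature are assumptions, not the repository's actual code.

# Assumed imports for the snippets in this file.
from collections import Counter, defaultdict

import numpy as np
from numpy.random import choice, randint


def get_file_content(path):
    """Hypothetical helper: return the whole file as one string."""
    with open(path, encoding='utf-8') as file_handle:
        return file_handle.read()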
Example #2
def exercise_1():
    """ See: exercises/lab02.pdf -> Exercise 1 """
    words = Counter(
        get_file_content('files/lab01/norm_wiki_sample.txt').split())
    counter_total = sum(words.values())

    first_6000_values = sum_n_counter_values(words, 6000) / counter_total
    first_30000_values = sum_n_counter_values(words, 30000) / counter_total

    return [first_6000_values, first_30000_values]
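sum_n_counter_values is another project helper that is not shown; judging by how it is used above, it sums the counts of the n most frequent words. A hedged sketch:

def sum_n_counter_values(counter, n):
    """Hypothetical helper: total count of the n most common items in a Counter."""
    return sum(count for _, count in counter.most_common(n))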
Example #3
def exercise_1():
    """ See: exercises/lab03.pdf -> Exercise 1 """

    length_of_the_alphabet = len(english_alphabet_generator(numbers=True))
    probabilities = [1 / length_of_the_alphabet] * length_of_the_alphabet
    pure_alphabet_entropy = calculate_entropy(probabilities)

    wiki_file_counter = Counter(
        get_file_content('files/lab01/norm_wiki_sample.txt'))
    _, values = zip(*wiki_file_counter.most_common(None))
    wiki_probabilities = conver_array_to_probabilities(values)
    wiki_alphabet_entropy = calculate_entropy(wiki_probabilities)

    return (pure_alphabet_entropy, wiki_alphabet_entropy)
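calculate_entropy, conver_array_to_probabilities (name kept as spelled in the snippet) and english_alphabet_generator are also project helpers. Assuming calculate_entropy computes the Shannon entropy H = -sum(p * log2(p)), minimal sketches could look like this; the exact alphabet returned by english_alphabet_generator is an assumption.

import string

import numpy as np


def english_alphabet_generator(numbers=False):
    """Hypothetical helper: lowercase letters, space, and optionally digits."""
    return list(string.ascii_lowercase + ' ' + (string.digits if numbers else ''))


def conver_array_to_probabilities(values):
    """Hypothetical helper: normalize a list of counts so it sums to 1."""
    values = np.array(values, dtype=float)
    return values / values.sum()


def calculate_entropy(probabilities):
    """Hypothetical helper: Shannon entropy, -sum(p * log2(p)), in bits."""
    probabilities = np.array(probabilities, dtype=float)
    return -np.sum(probabilities * np.log2(probabilities))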
Example #4
def calculate_conditional_entropy_on_file(file, depth, words=False):

    depth += 1  # an ngram spans the conditioning context (depth) plus the predicted item

    # Read file content and transform it to list of chars/words.
    content = get_file_content(file)
    file_content = content.split() if words else list(content)

    # Prepare counters for every possible ngram
    ngram_counter = defaultdict(int)
    base_counter = defaultdict(int)

    # Count ngrams and their prefixes.
    separator = ' ' if words else ''
    for item_idx in range(depth, len(file_content)):
        ngram_counter[separator.join(
            file_content[item_idx - depth:item_idx])] += 1
        base_counter[separator.join(
            file_content[item_idx - depth:item_idx - 1])] += 1

    num_ngrams = len(file_content) - depth

    # Calculate entropy
    entropy = 0
    for ngram in ngram_counter:
        prob_of_ngram = ngram_counter[ngram] / num_ngrams
        prefix = ' '.join(ngram.split()[:-1]) if words else ngram[:-1]
        cond_prob = ngram_counter[ngram] / base_counter[prefix]
        entropy -= prob_of_ngram * np.log2(cond_prob)

    return entropy
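The value returned above is the conditional entropy of the next character/word given the preceding depth items: H = -sum over ngrams of p(ngram) * log2(count(ngram) / count(prefix)). A possible usage pattern, reusing the corpus path from the other examples (the loop bounds are an assumption):

# Hypothetical usage: conditional entropy for growing context lengths.
for context_length in range(1, 4):
    h_chars = calculate_conditional_entropy_on_file(
        'files/lab01/norm_wiki_sample.txt', context_length, words=False)
    h_words = calculate_conditional_entropy_on_file(
        'files/lab01/norm_wiki_sample.txt', context_length, words=True)
    print(context_length, h_chars, h_words)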
Example #5
def exercise_2():
    """ See: exercises/lab02.pdf -> Exercise 2 """

    words = Counter(
        get_file_content('files/lab01/norm_wiki_sample.txt').split())
    unique_words, counts = zip(*words.items())

    probabilities = np.array(counts, dtype=float)
    probabilities /= probabilities.sum()

    output_text_length = 10**3
    output_text = [
        choice(a=unique_words, p=probabilities)
        for _ in range(output_text_length)
    ]

    return ' '.join(output_text)
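Unlike exercise_3 in Example #1, this generator samples every word independently from the corpus's unigram distribution (a zero-order approximation); conditioning each new word on the previous depth words, as the Markov-chain version does, typically yields more locally coherent output at the cost of building the ngram table first.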