Пример #1
0
def test_sampler(histogram):
    """Draw 10000 samples from ``histogram`` and print a tally of the draws.

    Every draw comes from ``sample(histogram)``. The collected draws are
    handed to ``histogram_dictionary`` and each resulting entry is printed
    on its own line as ``<item>: <count>``.
    """
    draws = [sample(histogram) for _ in range(10000)]
    tally = histogram_dictionary(draws)
    for word in tally.keys():
        count = tally[word]
        print(f"{word}: {count}")
Пример #2
0
def sentence_creator(histogram, num_words):
    """Return a space-separated string built from sampled words.

    ``sample(histogram)`` is called ``num_words`` times. The draws are then
    passed through ``histogram_dictionary`` and the keys of that result are
    joined with single spaces.

    NOTE(review): if ``histogram_dictionary`` returns a plain dict, a word
    drawn more than once appears only once, so the result can contain fewer
    than ``num_words`` words -- confirm this is intended.
    """
    draws = [sample(histogram) for _ in range(0, num_words)]
    tally = histogram_dictionary(draws)
    unique_words = [word for word in tally.keys()]
    return " ".join(unique_words)
Пример #3
0
    word_list = words[word_index]

    return word_list


def probability(histo):
    """Return each word's share of all counted occurrences as a percentage.

    probability = (number of occurrences of the word) / (sample total),
    multiplied by 100 and rounded to two decimal places.

    Args:
        histo: mapping of word -> occurrence count.

    Returns:
        A dict mapping every word in ``histo`` to a string such as
        ``"25.0%"``. An empty histogram yields an empty dict.
    """
    # The sample total is the sum of all counts. Using the number of
    # distinct words (len(histo)) would make the percentages add up to
    # more than 100 whenever any word occurs more than once.
    total_count = sum(histo.values())

    word_percents = {}
    for key, count in histo.items():
        # Guard against a histogram whose counts are all zero.
        share = count / total_count * 100 if total_count else 0.0
        word_percents[key] = str(round(share, 2)) + "%"

    return word_percents


if __name__ == '__main__':

    # Script entry point: build a histogram from sample.txt and run the
    # sampling and probability helpers on it.
    with open("sample.txt", "r") as data:
        # The open file object itself is handed to histogram_dictionary.
        histo = histogram_dictionary(data)
        # NOTE(review): both return values below are discarded, so nothing
        # is shown unless these functions print internally -- confirm intent.
        random_word(histo)
        probability(histo)
Пример #4
0
        words.append(sample(histogram))
    hist = histogram_dictionary(words)
    for item in hist.keys():
        sentence.append(item)
    sentence_str = " ".join(sentence)
    return sentence_str  #sentence


def test_sampler(histogram):
    """Sample ``histogram`` 10000 times and print how often each item came up.

    The draws (one ``sample(histogram)`` call each) are tallied with
    ``histogram_dictionary``; one ``<item>: <count>`` line is printed per
    entry of the tally.
    """
    trials = 10000
    sampled = [sample(histogram) for _ in range(trials)]
    observed = histogram_dictionary(sampled)
    for entry in observed.keys():
        occurrences = observed[entry]
        print(f"{entry}: {occurrences}")


if __name__ == "__main__":
    # TODO: take the source file and the word count from sys.argv instead of
    # hard-coding them.
    num_words = 8

    # Use a context manager so the file handle is closed as soon as the
    # text has been read (the bare open() previously leaked it).
    with open('harry_potterb1.txt') as f:
        words = f.read().split()
    hist = histogram_dictionary(words)
    print(sentence_creator(hist, num_words))

# Tests
# print(test_sampler(hist, num_words))
# print(word_sampler(hist))
Пример #5
0
    return: a histogram
    histogram creation function meant to check if words look like they are correctly sampled
    """

    random_word_histogram = {}
    for word in word_list:
        if word in random_word_histogram:
            random_word_histogram[word] += 1
        else:
            random_word_histogram[word] = 1

    return random_word_histogram


if __name__ == "__main__":
    # The first command-line argument is passed straight to
    # histogram.histogram_dictionary to build the histogram.
    text_document = sys.argv[1]
    histo = histogram.histogram_dictionary(text_document)

    # 10000 draws via random_sampling; only the commented-out check below
    # would consume them.
    random_words = [random_sampling(histo) for _ in range(10000)]
    # print(random_check(random_words))

    # Time 100000 draws of weighted_dict_sampling (wall clock, time.time())
    # and print the elapsed seconds.
    start = time.time()
    random_freq_words = [weighted_dict_sampling(histo) for _ in range(100000)]
    finish = time.time()
    print(finish - start)

    # Print random_check's result for the weighted draws.
    print(random_check(random_freq_words))

    # Print ten individual weighted samples.
    for _ in range(10):
        print(weighted_dict_sampling(histo))
Пример #6
0
            histogram_list[index + 1][1])
        count += to_add
        index += 1

    type_index = random.randrange(len(histogram_list[index][1]))

    return histogram_list[index][1][type_index]


if __name__ == "__main__":
    # Script entry point: print the result of get_probablities for the
    # histogram built from the words of the hard-coded file below.
    file_name = "notess.txt"

    #print(sample_lists_of_lists(file_name))
    # Load the words, then keep their count so it can be passed along with
    # the histogram.
    text = histogram.load_words(file_name)
    text_length = len(text)
    histogram_dict = histogram.histogram_dictionary(text)
    print(get_probablities(histogram_dict, text_length))
    '''current = time.time()
    words = list()


    print(time.time() - current)

    samples = histogram.histogram_count_lists_try_catch(words)
    print(samples)

    #text = histogram.load_words(file_name)
    histogram_list2 = histogram.histogram_dictionary(text)
    current2 = time.time()
    words = list()
    for i in range(10000):
Пример #7
0
import histogram
import sys
import random

# Helper function for getting the total number of objects in the histogram
def get_total(histogram):
    """Return the total number of counted objects in ``histogram``.

    Args:
        histogram: mapping of item -> occurrence count.

    Returns:
        The sum of all counts; 0 for an empty histogram.
    """
    return sum(histogram.values())


def sample_by_frequency(histogram):
    """Return a random word from ``histogram``, weighted by its count.

    A random position between 1 and the total of all counts is chosen, then
    the words are walked in order, subtracting each word's count from that
    position. The word whose count brings it to zero or below is returned,
    so a word with count ``c`` is chosen with probability ``c / total``.

    Args:
        histogram: mapping of word -> occurrence count.

    Returns:
        One key of ``histogram``.

    Raises:
        ValueError: if the histogram is empty or its counts sum to zero.
    """
    # The total weight is the sum of the counts, not the number of distinct
    # words (len(histogram)).
    total = sum(histogram.values())
    if total <= 0:
        raise ValueError("cannot sample from an empty histogram")

    # Pick the lucky position among all counted occurrences.
    remaining = random.randint(1, total)

    for word, count in histogram.items():
        # Consume this word's share of the positions.
        remaining -= count
        if remaining <= 0:
            return word
            


if __name__ == "__main__":
    source_text = 'harry_potterb1.txt'
    # Bind the result to its own name: assigning it to `histogram` would
    # replace the imported `histogram` module for the rest of the script.
    word_histogram = histogram.histogram_dictionary(source_text)
    # Show the sampled word; previously the result was computed and dropped.
    print(sample_by_frequency(word_histogram))
Пример #8
0
def main():
    """Print one ``sample_by_frequency`` draw for the hard-coded book file.

    The file name is turned into a word list with
    ``histogram.generate_word_list``, the list into a histogram with
    ``histogram.histogram_dictionary``, and a single sample is printed.
    """
    source_path = 'sherlock_no_title_chapters.txt'
    tokens = histogram.generate_word_list(source_path)
    word_counts = histogram.histogram_dictionary(tokens)
    chosen = sample_by_frequency(word_counts)
    print(chosen)