Example #1
def generate_coin_flip_distribution_offset(max_number_of_flips, flip_count_multiplier=1.1):

    flip_counts = []
    head_percentages = []
    number_of_flips = 2
    while number_of_flips < max_number_of_flips:
        logging.info("Generating " + str(number_of_flips) + " coin flips")

        # Flip the coin over and over and report back the number of heads
        # so we can then determine the ratio of heads
        number_of_heads = flip_a_coin(number_of_flips)
        ratio_of_heads = float(number_of_heads) / number_of_flips
        flip_counts.append(number_of_flips)

        # Whatever number we get, unless it was exactly .5, it was off from the ideal.  Record
        # that offset from the expected so we can plot it.
        error_from_expected = abs(.5 - ratio_of_heads)
        head_percentages.append(error_from_expected)

        # It would take forever to walk from 1 to a million, but it's not too bad if
        # we multiply the number of coin flip trials each time instead of adding.
        number_of_flips = int(number_of_flips * flip_count_multiplier) + 1

    # output a text variation of the generated percentages
    logging.debug(str(flip_counts))
    logging.debug(str(head_percentages))

    # we don't have room to display all number labels, so eliminate all but 8
    x_label_step_size = max(1, len(flip_counts) // 8)
    for i in range(len(flip_counts)):
        if i % x_label_step_size:
            flip_counts[i] = ""

    # now generate a plot
    charting.bar_chart("coin_flip.png", [head_percentages],
                       "Heads Flips - Offset from Ideal (" + str(max_number_of_flips) + ")",
                       flip_counts,
                       "Offset from .5 - Larger is Worse",
                       None,
                       ['#59799e'],
                       0,
                       0,
                       False,
                       .5,
                       "none")
Example #2
def compare_stemming_to_lemmatization():

    # load each of the corpora
    abc_words = nltk.corpus.abc.words()
    genesis_words = nltk.corpus.genesis.words()
    gutenberg_words = nltk.corpus.gutenberg.words()
    inaugural_words = nltk.corpus.inaugural.words()
    state_union_words = nltk.corpus.state_union.words()
    webtext_words = nltk.corpus.webtext.words()

    all_words = [
        abc_words, genesis_words, gutenberg_words, inaugural_words,
        state_union_words, webtext_words
    ]
    corpora_names = [
        "ABC", "Genesis", "Gutenberg", "Inaugural", "Union", "Web"
    ]

    word_counts = []
    lemmatized_counts = []
    stemmed_counts = []

    # iterate through each corpus and generate counts of the unique tokens
    # in each
    for index, words in enumerate(all_words):
        logging.debug("Lemmatizing " + corpora_names[index])
        lemmatized = collect_term_counts(lemmatize_words_array(words))
        logging.debug("Stemming " + corpora_names[index])
        stemmed = collect_term_counts(stem_words_array(words))
        word_counts.append(len(collect_term_counts(words)))
        lemmatized_counts.append(len(lemmatized))
        stemmed_counts.append(len(stemmed))

    logging.info("Corpora: " + str(corpora_names))
    logging.info("Word Counts: " + str(word_counts))
    logging.info("Lemmatized Word Counts: " + str(lemmatized_counts))
    logging.info("Stemmed Word Counts: " + str(stemmed_counts))

    # output a bar chart illustrating the above
    charting.bar_chart("stemming_vs_lemmatization.png",
                       [word_counts, lemmatized_counts, stemmed_counts],
                       "Token Counts for Words, Stems and Lemmas",
                       corpora_names, "Token Counts",
                       ["Words", "Stems", "Lemmas"],
                       ['#59799e', '#810CE8', '#FF0000'], .5)
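
This example leans on three helpers that aren't shown: collect_term_counts, lemmatize_words_array, and stem_words_array. One plausible shape for them, assuming collections.Counter for the counting and NLTK's WordNetLemmatizer and PorterStemmer for the normalization (the real helpers may differ):

from collections import Counter

import nltk

def collect_term_counts(words):
    # map each unique lowercased token to its occurrence count
    return Counter(word.lower() for word in words)

def lemmatize_words_array(words):
    # requires the wordnet data: nltk.download('wordnet')
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(word.lower()) for word in words]

def stem_words_array(words):
    stemmer = nltk.stem.PorterStemmer()
    return [stemmer.stem(word.lower()) for word in words]

Stemming is usually more aggressive than lemmatization at collapsing word forms, so stemmed_counts would typically come out lowest of the three series in the chart.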
Example #3
def compare_stemming_to_lemmatization():

    # load each of the corpora
    abc_words = nltk.corpus.abc.words()
    genesis_words = nltk.corpus.genesis.words()
    gutenberg_words = nltk.corpus.gutenberg.words()
    inaugural_words = nltk.corpus.inaugural.words()
    state_union_words = nltk.corpus.state_union.words()
    webtext_words = nltk.corpus.webtext.words()

    all_words = [abc_words, genesis_words, gutenberg_words, inaugural_words, state_union_words, webtext_words]
    corpora_names = ["ABC", "Genesis", "Gutenberg", "Inaugural", "Union", "Web"]

    word_counts = []
    lemmatized_counts = []
    stemmed_counts = []

    # iterate through each corpus and generate counts of the unique tokens
    # in each
    for index, words in enumerate(all_words):
        logging.debug("Lemmatizing " + corpora_names[index])
        lemmatized = collect_term_counts(lemmatize_words_array(words))
        logging.debug("Stemming " + corpora_names[index])
        stemmed = collect_term_counts(stem_words_array(words))
        word_counts.append(len(collect_term_counts(words)))
        lemmatized_counts.append(len(lemmatized))
        stemmed_counts.append(len(stemmed))

    logging.info("Corpora: " + str(corpora_names))
    logging.info("Word Counts: " + str(word_counts))
    logging.info("Lemmatized Word Counts: " + str(lemmatized_counts))
    logging.info("Stemmed Word Counts: " + str(stemmed_counts))

    # output a bar chart illustrating the above
    charting.bar_chart(
        "stemming_vs_lemmatization.png",
        [word_counts, lemmatized_counts, stemmed_counts],
        "Token Counts for Words, Stems and Lemmas",
        corpora_names,
        "Token Counts",
        ["Words", "Lemmas", "Stems"],
        ["#59799e", "#810CE8", "#FF0000"],
        0.5,
    )
Example #4
def chart_term_frequencies(file_name,
                           title,
                           y_axis,
                           term_frequencies,
                           indexes=numpy.arange(5)):
    chart_terms = []
    chart_frequencies = []
    selected_frequencies = []
    for index in indexes:
        selected_frequencies.append(term_frequencies[index])

    # chart each term as its own single-element data set so every term
    # gets its own color and legend entry
    for term, frequency in selected_frequencies:
        chart_terms.append(term)
        chart_frequencies.append([frequency])

    charting.bar_chart(
        file_name, chart_frequencies, title, None, y_axis, chart_terms,
        ['#59799e', '#810CE8', '#FF0000', '#12995D', '#FD53FF', '#AA55CC'], 1,
        0.2)
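
A hypothetical call, assuming term_frequencies is a list of (term, frequency) pairs sorted by descending frequency, e.g. from Counter.most_common(); the default indexes=numpy.arange(5) then charts the top five terms:

from collections import Counter

import nltk

term_frequencies = Counter(nltk.corpus.genesis.words()).most_common()
chart_term_frequencies("top_terms.png", "Top Genesis Terms", "Frequency",
                       term_frequencies)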
Example #5
def collect_and_output_frequency_frequencies(corpus, corpus_name,
                                             term_frequencies):
    if term_frequencies is None:
        term_frequencies = collect_term_counts(corpus)

    frequency_frequencies = {}
    for term, frequency in term_frequencies.items():
        if frequency in frequency_frequencies:
            frequency_frequencies[frequency] += 1
        else:
            frequency_frequencies[frequency] = 1

    unsorted_array = [[key, value]
                      for key, value in frequency_frequencies.items()]
    sorted_array = sorted(
        unsorted_array,
        key=lambda frequency_frequency: frequency_frequency[1],
        reverse=True)

    frequency_frequencies_to_chart = []
    frequencies_to_chart = []
    output_csv_file = open_csv_file("frequency_frequencies.csv",
                                    ["Frequency Frequency", "Term Frequency"])

    # we collect frequencies_to_chart and frequency_frequencies_to_chart each into their own single-dimensional
    # array, then pass frequency_frequencies_to_chart inside another array so that it is 2D as the chart
    # requires.  This means there is exactly 1 data set, with one column per charted frequency, and no second
    # set to compare it to.
    for index, (term_frequency,
                frequency_frequency) in enumerate(sorted_array):
        output_csv_file.writerow([frequency_frequency] + [term_frequency])
        if index <= 20:
            frequencies_to_chart.append(term_frequency)
            frequency_frequencies_to_chart.append(frequency_frequency)

    charting.bar_chart(
        "frequency_frequencies.png", [frequency_frequencies_to_chart],
        "Frequency Frequencies (" + corpus_name + ")", frequencies_to_chart,
        "Frequency Frequency", None,
        ['#59799e', '#810CE8', '#FF0000', '#12995D', '#FD53FF', '#AA55CC'],
        0.2, 0.0)

    return frequency_frequencies
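
The open_csv_file helper (fs.open_csv_file in the next example) isn't shown, but the writerow calls imply it returns a csv.writer. A minimal sketch under that assumption (the file handle is left open here; the real helper presumably manages it):

import csv

def open_csv_file(file_name, header_row):
    # assumed behavior: open the file, emit the header row, and hand
    # back the csv.writer used for the subsequent writerow calls
    handle = open(file_name, "w", newline="")
    writer = csv.writer(handle)
    writer.writerow(header_row)
    return writer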
Example #6
def collect_and_output_frequency_frequencies(corpus, corpus_name, term_frequencies):
    if term_frequencies is None:
        term_frequencies = collect_term_counts(corpus)

    frequency_frequencies = {}
    for term, frequency in term_frequencies.items():
        if frequency in frequency_frequencies:
            frequency_frequencies[frequency] += 1
        else:
            frequency_frequencies[frequency] = 1

    unsorted_array = [[key, value] for key, value in frequency_frequencies.items()]
    sorted_array = sorted(unsorted_array, key=lambda frequency_frequency: frequency_frequency[1], reverse=True)

    frequency_frequencies_to_chart = []
    frequencies_to_chart = []
    output_csv_file = fs.open_csv_file("frequency_frequencies.csv", ["Frequency Frequency", "Term Frequency"])

    # we collect frequencies_to_chart and frequency_frequencies_to_chart each into their own single-dimensional
    # array, then pass frequency_frequencies_to_chart inside another array so that it is 2D as the chart
    # requires.  This means there is exactly 1 data set, with one column per charted frequency, and no second
    # set to compare it to.
    for index, (term_frequency, frequency_frequency) in enumerate(sorted_array):
        output_csv_file.writerow([frequency_frequency] + [term_frequency])
        if index <= 20:
            frequencies_to_chart.append(term_frequency)
            frequency_frequencies_to_chart.append(frequency_frequency)

    charting.bar_chart(
        "frequency_frequencies.png",
        [frequency_frequencies_to_chart],
        "Frequency Frequencies (" + corpus_name + ")",
        frequencies_to_chart,
        "Frequency Frequency",
        None,
        ["#59799e", "#810CE8", "#FF0000", "#12995D", "#FD53FF", "#AA55CC"],
        0.2,
        0.0,
    )

    return frequency_frequencies
Example #7
def generate_coin_flip_distribution_offset(max_number_of_flips,
                                           flip_count_multiplier=1.1):

    flip_counts = []
    head_percentages = []
    number_of_flips = 2
    while number_of_flips < max_number_of_flips:
        logging.info("Generating " + str(number_of_flips) + " coin flips")

        # Flip the coin over and over and report back the number of heads
        # so we can then determine the ratio of heads
        number_of_heads = flip_a_coin(number_of_flips)
        ratio_of_heads = float(number_of_heads) / number_of_flips
        flip_counts.append(number_of_flips)

        # Whatever number we get, unless it was exactly .5, it was off from the ideal.  Record
        # that offset from the expected so we can plot it.
        error_from_expected = abs(.5 - ratio_of_heads)
        head_percentages.append(error_from_expected)

        # It would take forever to walk from 1 to a million, but it's not too bad if
        # we multiply the number of coin flip trials each time instead of adding.
        number_of_flips = int(number_of_flips * flip_count_multiplier) + 1

    # output a text variation of the generated percentages
    logging.debug(str(flip_counts))
    logging.debug(str(head_percentages))

    # we don't have room to display all number labels, so eliminate all but 8
    x_label_step_size = max(1, len(flip_counts) // 8)
    for i in range(len(flip_counts)):
        if i % x_label_step_size:
            flip_counts[i] = ""

    # now generate a plot
    charting.bar_chart(
        "coin_flip.png", [head_percentages],
        "Heads Flips - Offset from Ideal (" + str(max_number_of_flips) + ")",
        flip_counts, "Offset from .5 - Larger is Worse", None, ['#59799e'], 0,
        0, False, .5, "none")
Example #8
def chart_term_frequencies(file_name, title, y_axis, term_frequencies, indexes=numpy.arange(5)):
    chart_terms = []
    chart_frequencies = []
    selected_frequencies = []
    for index in indexes:
        selected_frequencies.append(term_frequencies[index])

    # chart each term as its own single-element data set so every term
    # gets its own color and legend entry
    for term, frequency in selected_frequencies:
        chart_terms.append(term)
        chart_frequencies.append([frequency])

    charting.bar_chart(
        file_name,
        chart_frequencies,
        title,
        None,
        y_axis,
        chart_terms,
        ["#59799e", "#810CE8", "#FF0000", "#12995D", "#FD53FF", "#AA55CC"],
        1,
        0.2,
    )
Example #9
def marbles_and_jars(num_trials):

    # read in the csv file of jars
    rows = fs.read_csv("marbles.csv")
    logging.debug("Read rows: " + str(rows))

    jars = {}
    headers = []
    marble_picks = {}

    # go through the rows and build a dictionary of jar_name => array of marble colors
    for index, row in enumerate(rows):
        # first row is just header data
        if index == 0:
            headers = row
        else:
            # go through each of the headers (these are columns)
            for column_index, header in enumerate(headers):
                # if it's the first column then it holds the jar name - initialize its marble list to empty (no marbles)
                if column_index == 0:
                    jars[row[0]] = []
                else:
                    # every other column holds a marble count; the marble color comes from the header
                    marble_color = header

                    # initialize the counters for picking marbles for the given color
                    marble_picks[marble_color] = 0

                    # set blank cells to 0, otherwise add the value in the cell
                    if len(row[column_index]) == 0:
                        num_marbles = 0
                    else:
                        num_marbles = int(row[column_index])

                    # expand an array of colors, 1 element for each num_marbles
                    jars[row[0]] += [marble_color] * num_marbles

    logging.info("Jars: " + str(jars))

    for i in range(0, num_trials):
        # pick a random jar from all of the jars without taking the marbles into consideration
        jar_name = random.choice(list(jars.keys()))

        # now draw a single marble from all the marbles given that we selected a jar
        marbles = jars[jar_name]
        marble = random.choice(marbles)
        marble_picks[marble] += 1

    logging.info("Marble picks : " + str(marble_picks))

    # prepare the data for plotting
    keys = []
    data = []
    for key, value in marble_picks.items():
        column_name = key + " (" + str(value) + ")"
        keys.append(column_name)
        data.append(value / float(num_trials))

    description_list = []
    for jar_name, jar_marbles in jars.items():
        description_list.append(jar_name + " (" + str(len(jar_marbles)) + ")")
    description = ", ".join(description_list)

    # plot the data
    charting.bar_chart("marbles.png",
                       [data],
                       "Marbles in Jars (" + str(num_trials) + ") - " + description,
                       keys,
                       "Probabilities",
                       None,
                       ['#59799e'])
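
The parsing loop implies a marbles.csv laid out with jar names in the first column, one column per marble color, and blank cells meaning zero marbles of that color. Hypothetical contents:

jar,red,green,blue
jar_one,5,3,
jar_two,,7,2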
Example #10
def marbles_and_jars(num_trials):

    # read in the csv file of jars
    rows = fs.read_csv("marbles.csv")
    logging.debug("Read rows: " + str(rows))

    jars = {}
    headers = []
    marble_picks = {}

    # go through the rows and build a dictionary of jar_name => array of marble colors
    for index, row in enumerate(rows):
        # first row is just header data
        if index == 0:
            headers = row
        else:
            # go through each of the headers (these are columns)
            for column_index, header in enumerate(headers):
                # if it's the first column then it holds the jar name - initialize its marble list to empty (no marbles)
                if column_index == 0:
                    jars[row[0]] = []
                else:
                    # every other column holds a marble count; the marble color comes from the header
                    marble_color = header

                    # initialize the counters for picking marbles for the given color
                    marble_picks[marble_color] = 0

                    # set blank cells to 0, otherwise add the value in the cell
                    if len(row[column_index]) == 0:
                        num_marbles = 0
                    else:
                        num_marbles = int(row[column_index])

                    # expand an array of colors, 1 element for each num_marbles
                    jars[row[0]] += [marble_color] * num_marbles

    logging.info("Jars: " + str(jars))

    for i in range(0, num_trials):
        # pick a random jar from all of the jars without taking the marbles into consideration
        jar_name = random.choice(list(jars.keys()))

        # now draw a single marble from all the marbles given that we selected a jar
        marbles = jars[jar_name]
        marble = random.choice(marbles)
        marble_picks[marble] += 1

    logging.info("Marble picks : " + str(marble_picks))

    # prepare the data for plotting
    keys = []
    data = []
    for key, value in marble_picks.items():
        column_name = key + " (" + str(value) + ")"
        keys.append(column_name)
        data.append(value / float(num_trials))

    description_list = []
    for jar_name, jar_marbles in jars.items():
        description_list.append(jar_name + " (" + str(len(jar_marbles)) + ")")
    description = ", ".join(description_list)

    # plot the data
    charting.bar_chart(
        "marbles.png", [data],
        "Marbles in Jars (" + str(num_trials) + ") - " + description, keys,
        "Probabilities", None, ['#59799e'])