def generate_coin_flip_distribution_offset(max_number_of_flips, flip_count_multiplier=1.1):
    """Chart how far repeated coin-flip trials deviate from the ideal 50% heads.

    Runs trials of geometrically increasing size (growing by
    flip_count_multiplier each step) until the trial size reaches
    max_number_of_flips, records each trial's absolute offset of the heads
    ratio from 0.5, and renders the offsets to coin_flip.png via
    charting.bar_chart.

    :param max_number_of_flips: upper bound (exclusive) on flips per trial
    :param flip_count_multiplier: geometric growth factor between trial sizes
    """
    flip_counts = []
    head_percentages = []

    number_of_flips = 2
    while number_of_flips < max_number_of_flips:
        logging.info("Generating " + str(number_of_flips) + " coin flips")

        # Flip the coin over and over and report back the number of heads
        # so we can then determine the ratio of heads
        number_of_heads = flip_a_coin(number_of_flips)
        ratio_of_heads = float(number_of_heads) / number_of_flips
        flip_counts.append(number_of_flips)

        # Whatever number we get, unless it was exactly .5, it was off from the ideal. Record
        # that offset from the expected so we can plot it.
        error_from_expected = abs(.5 - ratio_of_heads)
        head_percentages.append(error_from_expected)

        # It would take forever to walk from 1 to a million, but it's not too bad if
        # we multiply the number of coin flip trials each time instead of adding.
        number_of_flips = int(number_of_flips * flip_count_multiplier) + 1

    # output a text variation of the generated percentages
    logging.debug(str(flip_counts))
    logging.debug(str(head_percentages))

    # we don't have room to display all number labels, so eliminate all but 8.
    # BUG FIX: the original `len(flip_counts) / 8` is 0 whenever fewer than 8
    # trials were generated, making `i % x_label_step_size` below raise
    # ZeroDivisionError; clamp the step to at least 1.
    x_label_step_size = max(len(flip_counts) // 8, 1)
    for i in range(len(flip_counts)):
        if i % x_label_step_size:
            flip_counts[i] = ""

    # now generate a plot
    charting.bar_chart(
        "coin_flip.png",
        [head_percentages],
        "Heads Flips - Offset from Ideal (" + str(max_number_of_flips) + ")",
        flip_counts,
        "Offset from .5 - Larger is Worse",
        None,
        ['#59799e'],
        0, 0, False, .5, "none")
def compare_stemming_to_lemmatization():
    """Compare unique-token counts per corpus for raw words, lemmas, and stems.

    Loads six NLTK corpora, counts the distinct tokens in each before and
    after lemmatization and stemming, logs the counts, and writes a grouped
    bar chart to stemming_vs_lemmatization.png.
    """
    # load each of the corpora
    abc_words = nltk.corpus.abc.words()
    genesis_words = nltk.corpus.genesis.words()
    gutenberg_words = nltk.corpus.gutenberg.words()
    inaugural_words = nltk.corpus.inaugural.words()
    state_union_words = nltk.corpus.state_union.words()
    webtext_words = nltk.corpus.webtext.words()

    all_words = [abc_words, genesis_words, gutenberg_words, inaugural_words,
                 state_union_words, webtext_words]
    corpora_names = ["ABC", "Genesis", "Gutenberg", "Inaugural", "Union", "Web"]

    word_counts = []
    lemmatized_counts = []
    stemmed_counts = []

    # iterate through each corpus and generate counts of the unique tokens
    # in each
    for index, words in enumerate(all_words):
        logging.debug("Lemmatizing " + corpora_names[index])
        lemmatized = collect_term_counts(lemmatize_words_array(words))
        logging.debug("Stemming " + corpora_names[index])
        stemmed = collect_term_counts(stem_words_array(words))
        word_counts.append(len(collect_term_counts(words)))
        lemmatized_counts.append(len(lemmatized))
        stemmed_counts.append(len(stemmed))

    logging.info("Corpora: " + str(corpora_names))
    logging.info("Word Counts: " + str(word_counts))
    logging.info("Lemmatized Word Counts: " + str(lemmatized_counts))
    logging.info("Stemmed Word Counts: " + str(stemmed_counts))

    # output a bar chart illustrating the above.
    # BUG FIX: the legend previously read ["Words", "Stems", "Lemmas"] while
    # the data sets are ordered [words, lemmas, stems]; the labels now match
    # the data order (as in the other implementation of this function).
    charting.bar_chart(
        "stemming_vs_lemmatization.png",
        [word_counts, lemmatized_counts, stemmed_counts],
        "Token Counts for Words, Stems and Lemmas",
        corpora_names,
        "Token Counts",
        ["Words", "Lemmas", "Stems"],
        ['#59799e', '#810CE8', '#FF0000'],
        .5)
def compare_stemming_to_lemmatization():
    """Chart how many distinct tokens each corpus yields as raw words,
    lemmas, and stems.

    Pulls six NLTK corpora, counts unique tokens for the raw words and for
    their lemmatized and stemmed forms, logs every count, and renders the
    comparison to stemming_vs_lemmatization.png.
    """
    # load each of the corpora, paired with the display name used in logs and chart
    corpora = [
        ("ABC", nltk.corpus.abc.words()),
        ("Genesis", nltk.corpus.genesis.words()),
        ("Gutenberg", nltk.corpus.gutenberg.words()),
        ("Inaugural", nltk.corpus.inaugural.words()),
        ("Union", nltk.corpus.state_union.words()),
        ("Web", nltk.corpus.webtext.words()),
    ]
    corpora_names = [name for name, _ in corpora]

    word_counts = []
    lemmatized_counts = []
    stemmed_counts = []

    # count the unique tokens in each corpus, before and after each
    # normalization strategy
    for name, words in corpora:
        logging.debug("Lemmatizing " + name)
        lemma_terms = collect_term_counts(lemmatize_words_array(words))
        logging.debug("Stemming " + name)
        stem_terms = collect_term_counts(stem_words_array(words))
        word_counts.append(len(collect_term_counts(words)))
        lemmatized_counts.append(len(lemma_terms))
        stemmed_counts.append(len(stem_terms))

    logging.info("Corpora: " + str(corpora_names))
    logging.info("Word Counts: " + str(word_counts))
    logging.info("Lemmatized Word Counts: " + str(lemmatized_counts))
    logging.info("Stemmed Word Counts: " + str(stemmed_counts))

    # output a bar chart illustrating the above
    charting.bar_chart(
        "stemming_vs_lemmatization.png",
        [word_counts, lemmatized_counts, stemmed_counts],
        "Token Counts for Words, Stems and Lemmas",
        corpora_names,
        "Token Counts",
        ["Words", "Lemmas", "Stems"],
        ["#59799e", "#810CE8", "#FF0000"],
        0.5,
    )
def chart_term_frequencies(file_name, title, y_axis, term_frequencies, indexes=None):
    """Render a selection of (term, frequency) pairs as a bar chart.

    :param file_name: output image path passed through to charting.bar_chart
    :param title: chart title
    :param y_axis: y-axis label
    :param term_frequencies: indexable sequence of (term, frequency) pairs
    :param indexes: which entries of term_frequencies to chart; defaults to
        the first five. BUG FIX: the default was previously
        ``numpy.arange(5)``, a mutable array built once at definition time
        and shared across all calls; a None sentinel gives the identical
        default selection without that pitfall.
    """
    if indexes is None:
        indexes = range(5)

    selected_frequencies = [term_frequencies[index] for index in indexes]
    chart_terms = [term for term, _ in selected_frequencies]
    # each frequency is wrapped in its own single-element data set —
    # presumably so every bar can take its own color from the palette below;
    # confirm against charting.bar_chart if this changes
    chart_frequencies = [[frequency] for _, frequency in selected_frequencies]

    charting.bar_chart(
        file_name,
        chart_frequencies,
        title,
        None,
        y_axis,
        chart_terms,
        ['#59799e', '#810CE8', '#FF0000', '#12995D', '#FD53FF', '#AA55CC'],
        1, 0.2)
def collect_and_output_frequency_frequencies(corpus, corpus_name, term_frequencies):
    """Count how many terms share each frequency ("frequency of frequencies"),
    write the full distribution to frequency_frequencies.csv, chart the top
    21 entries to frequency_frequencies.png, and return the raw mapping.

    :param corpus: token sequence, consulted only when term_frequencies is None
    :param corpus_name: display name embedded in the chart title
    :param term_frequencies: optional precomputed {term: count} mapping
    :return: dict mapping term frequency -> number of terms with that frequency
    """
    if term_frequencies is None:
        term_frequencies = collect_term_counts(corpus)

    # invert the counts: how many distinct terms occur with each frequency?
    frequency_frequencies = {}
    for term, frequency in term_frequencies.iteritems():
        # dict.get replaces the deprecated has_key() check-then-set pattern
        frequency_frequencies[frequency] = frequency_frequencies.get(frequency, 0) + 1

    # most common frequency-frequency first
    unsorted_array = [[key, value] for key, value in frequency_frequencies.iteritems()]
    sorted_array = sorted(
        unsorted_array,
        key=lambda frequency_frequency: frequency_frequency[1],
        reverse=True)

    frequency_frequencies_to_chart = []
    frequencies_to_chart = []
    # NOTE(review): the sibling implementation of this function calls
    # fs.open_csv_file here; confirm whether the bare open_csv_file is a
    # distinct helper or a missing module qualifier.
    output_csv_file = open_csv_file("frequency_frequencies.csv", ["Frequency Frequency", "Term Frequency"])

    # we collect frequencies_to_chart and frequency_frequencies_to_chart each into their own single dimensional
    # array. Then we pass frequency_frequencies_to_chart in an array so that it is 2D as needed by the chart.
    # This means there is exactly 1 data set and 6 columns of data in the set. There is no second set to compare
    # it to.
    for index, (term_frequency, frequency_frequency) in enumerate(sorted_array):
        output_csv_file.writerow([frequency_frequency] + [term_frequency])
        # only the top 21 rows make it onto the chart
        if index <= 20:
            frequencies_to_chart.append(term_frequency)
            frequency_frequencies_to_chart.append(frequency_frequency)

    charting.bar_chart(
        "frequency_frequencies.png",
        [frequency_frequencies_to_chart],
        "Frequency Frequencies (" + corpus_name + ")",
        frequencies_to_chart,
        "Frequency Frequency",
        None,
        ['#59799e', '#810CE8', '#FF0000', '#12995D', '#FD53FF', '#AA55CC'],
        0.2, 0.0)

    return frequency_frequencies
def collect_and_output_frequency_frequencies(corpus, corpus_name, term_frequencies):
    """Build the "frequency of frequencies" table for a corpus, persist it to
    CSV, plot the most common entries, and return the full mapping.

    :param corpus: tokens to count when no precomputed counts are supplied
    :param corpus_name: label woven into the chart title
    :param term_frequencies: optional {term: count} dict; derived from corpus when None
    :return: {term frequency: number of terms occurring that many times}
    """
    if term_frequencies is None:
        term_frequencies = collect_term_counts(corpus)

    # tally how many distinct terms share each occurrence count
    frequency_frequencies = {}
    for term, frequency in term_frequencies.iteritems():
        if frequency in frequency_frequencies:
            frequency_frequencies[frequency] += 1
        else:
            frequency_frequencies[frequency] = 1

    # order the pairs so the most frequent frequency-counts come first
    pairs = [[key, value] for key, value in frequency_frequencies.iteritems()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)

    chart_counts = []
    chart_frequencies = []
    output_csv_file = fs.open_csv_file("frequency_frequencies.csv", ["Frequency Frequency", "Term Frequency"])

    # chart_frequencies and chart_counts are each one-dimensional; the counts
    # are wrapped in an outer list below because the chart expects a 2D
    # structure: one data set holding all 21 columns, with no second set to
    # compare it against.
    for index, (term_frequency, frequency_frequency) in enumerate(pairs):
        output_csv_file.writerow([frequency_frequency] + [term_frequency])
        # only the top 21 rows are charted; every row still lands in the CSV
        if index <= 20:
            chart_frequencies.append(term_frequency)
            chart_counts.append(frequency_frequency)

    charting.bar_chart(
        "frequency_frequencies.png",
        [chart_counts],
        "Frequency Frequencies (" + corpus_name + ")",
        chart_frequencies,
        "Frequency Frequency",
        None,
        ["#59799e", "#810CE8", "#FF0000", "#12995D", "#FD53FF", "#AA55CC"],
        0.2,
        0.0,
    )

    return frequency_frequencies
def generate_coin_flip_distribution_offset(max_number_of_flips, flip_count_multiplier=1.1):
    """Plot how far each coin-flip trial's heads ratio lands from the ideal 0.5.

    Trial sizes start at 2 and grow geometrically by flip_count_multiplier
    until they would reach max_number_of_flips; the per-trial offsets are
    rendered to coin_flip.png.
    """
    trial_sizes = []
    offsets = []

    flips = 2
    while flips < max_number_of_flips:
        logging.info("Generating " + str(flips) + " coin flips")

        # run the trial and turn the heads tally into a ratio
        heads = flip_a_coin(flips)
        heads_ratio = float(heads) / flips
        trial_sizes.append(flips)

        # record the distance from a perfect 50/50 split for plotting
        offsets.append(abs(.5 - heads_ratio))

        # grow the trial size geometrically; stepping by one would take far
        # too long to reach large flip counts
        flips = int(flips * flip_count_multiplier) + 1

    # dump a text version of what will be charted
    logging.debug(str(trial_sizes))
    logging.debug(str(offsets))

    # the x axis can't fit every label, so blank all but roughly eight of them
    x_label_step_size = len(trial_sizes) / 8
    for i in range(0, len(trial_sizes)):
        if i % x_label_step_size:
            trial_sizes[i] = ""

    # render the chart
    charting.bar_chart(
        "coin_flip.png",
        [offsets],
        "Heads Flips - Offset from Ideal (" + str(max_number_of_flips) + ")",
        trial_sizes,
        "Offset from .5 - Larger is Worse",
        None,
        ['#59799e'],
        0, 0, False, .5, "none")
def chart_term_frequencies(file_name, title, y_axis, term_frequencies, indexes=numpy.arange(5)):
    """Render the selected (term, frequency) pairs as a bar chart.

    The entries of term_frequencies named by indexes are split into labels
    and values; each value is wrapped in its own one-element data set —
    presumably so each bar can draw with its own color from the fixed
    palette below (confirm against charting.bar_chart).
    """
    picked = [term_frequencies[i] for i in indexes]
    labels = [pair[0] for pair in picked]
    data_sets = [[pair[1]] for pair in picked]

    charting.bar_chart(
        file_name,
        data_sets,
        title,
        None,
        y_axis,
        labels,
        ["#59799e", "#810CE8", "#FF0000", "#12995D", "#FD53FF", "#AA55CC"],
        1,
        0.2,
    )
def marbles_and_jars(num_trials):
    """Simulate drawing marbles: pick a jar uniformly at random, then a marble
    uniformly from that jar, num_trials times, and chart the per-color pick
    probabilities to marbles.png.

    Jar contents come from marbles.csv: the first row is a header of marble
    colors, and each later row is a jar name followed by per-color counts
    (blank cells mean zero marbles).

    :param num_trials: number of jar-then-marble draws to simulate
    """
    # read in the csv file of jars
    rows = fs.read_csv("marbles.csv")
    logging.debug("Read rows: " + str(rows))

    jars = {}
    headers = []
    marble_picks = {}

    # go through the rows and build a dictionary of jar_name => array of marble colors
    for index, row in enumerate(rows):
        # first row is just header data
        if index == 0:
            headers = row
        else:
            # go through each of the headers (these are columns)
            for column_index, header in enumerate(headers):
                # the first column is the name of the jar - start it with no marbles
                if column_index == 0:
                    jars[row[0]] = []
                else:
                    # each other column holds a marble count; the color is the header
                    marble_color = header
                    # initialize the pick counter for this color
                    marble_picks[marble_color] = 0
                    # treat blank cells as zero marbles
                    num_marbles = int(row[column_index]) if row[column_index] else 0
                    # expand into one list element per marble of this color
                    jars[row[0]] += [marble_color] * num_marbles

    logging.info("Jars: " + str(jars))

    # hoisted out of the loop: the jar list never changes between trials
    jar_names = jars.keys()
    for _ in range(num_trials):
        # pick a random jar from all of the jars w/out taking the marbles into consideration
        # (random.choice replaces the randint-based indexing; same uniform draw)
        jar_name = random.choice(jar_names)
        # now draw a single marble from all the marbles given that we selected a jar
        marble = random.choice(jars[jar_name])
        marble_picks[marble] += 1

    logging.info("Marble picks : " + str(marble_picks))

    # prepare the data for plotting: label each column with its raw pick
    # count and convert counts to observed probabilities
    keys = []
    data = []
    for key, value in marble_picks.iteritems():
        keys.append(key + " (" + str(value) + ")")
        data.append(value / float(num_trials))

    # chart title lists each jar with its total marble count
    description_list = []
    for jar_name, jar_marbles in jars.iteritems():
        description_list.append(jar_name + "(" + str(len(jar_marbles)) + ")")
    description = ", ".join(description_list)

    # plot the data
    charting.bar_chart(
        "marbles.png",
        [data],
        "Marbles in Jars (" + str(num_trials) + ") - " + description,
        keys,
        "Probabilities",
        None,
        ['#59799e'])
def marbles_and_jars(num_trials):
    """Simulate num_trials draws of (uniformly random jar, then uniformly
    random marble from that jar) using jar definitions read from marbles.csv,
    and chart the per-color pick probabilities to marbles.png.
    """
    # read in the csv file of jars
    rows = fs.read_csv("marbles.csv")
    logging.debug("Read rows: " + str(rows))
    jars = {}
    headers = []
    marble_picks = {}
    # go through the rows and build a dictionary of jar_name => array of marble colors
    for index, row in enumerate(rows):
        # first row is just header data
        if index == 0:
            headers = row
        else:
            # go through each of the headers (these are columns)
            for column_index, header in enumerate(headers):
                # if the first column than it's the name of the jar - initialize the array to empty (no marbles)
                if column_index == 0:
                    jars[row[0]] = []
                else:
                    # each other column represents a number of marbles, the name of the marble is in the header
                    marble_color = header
                    # initialize the counters for picking marbles for the given color
                    marble_picks[marble_color] = 0
                    # set blank cells to 0, otherwise add the value in the cell
                    if len(row[column_index]) == 0:
                        num_marbles = 0
                    else:
                        num_marbles = int(row[column_index])
                    # expand an array of colors, 1 element for each num_marbles
                    jars[row[0]] += [marble_color] * num_marbles
    logging.info("Jars: " + str(jars))
    for i in range(0, num_trials):
        # pick a random jar from all of the jars w/out taking the marbles into consideration
        jar_names = jars.keys()
        jar_name = jar_names[random.randint(0, len(jar_names) - 1)]
        # now draw a single marble from all the marbles given that we selected a jar
        marbles = jars[jar_name]
        marble = marbles[random.randint(0, len(marbles) - 1)]
        marble_picks[marble] += 1
    logging.info("Marble picks : " + str(marble_picks))
    # prepare the data for plotting
    keys = []
    data = []
    for key, value in marble_picks.iteritems():
        # label each column with the color and its raw pick count
        column_name = key + " (" + str(value) + ")"
        keys.extend([column_name])
        # convert the raw count into an observed probability
        data.extend([value / float(num_trials)])
    description_list = []
    for jar_name, jar_marbles in jars.iteritems():
        # jar name plus its total marble count, used in the chart title
        description_list.append(jar_name + "(" + str(len(jar_marbles)) + ")")
    description = ", ".join(description_list)
    # plot the data
    charting.bar_chart(
        "marbles.png",
        [data],
        "Marbles in Jars (" + str(num_trials) + ") - " + description,
        keys,
        "Probabilities",
        None,
        ['#59799e'])