def do_stats(num_sents, benchmarkincr=.05, status=1):
    """Generate many random sentences and print statistics about them.

    Each sentence is produced by ``rsg.random_sentence(data, length)``, where
    ``data`` and ``length`` are module-level globals, and is scored with
    ``num_cbreaks``.  The totals are then averaged and printed.

    num_sents: number of sentences to run the analysis on
    benchmarkincr: fraction of the run (0.05 == 5%) that must elapse between
        two progress messages
    status: boolean, whether or not to show the progress indicator

    Returns None; the results are only printed.  A ratio whose denominator is
    zero (no sentences requested, or no words generated) is reported as 0.0
    instead of raising ZeroDivisionError.
    """
    def ratio(numerator, denominator):
        # Float division that tolerates an empty run.
        if not denominator:
            return 0.0
        return numerator * 1.0 / denominator

    total_breaks = 0
    total_words = 0
    total_nobreaks = 0
    lastbenchmark = 0.0

    for i in xrange(num_sents):
        if status and 1.0 * i / num_sents > lastbenchmark + benchmarkincr:
            # Single-argument print(...) behaves the same as the print
            # statement used elsewhere in this file.
            print("%d%% done, %d sentences analyzed"
                  % (100.0 * i / num_sents, i))
            lastbenchmark += benchmarkincr

        # The last item produced by the generator is dropped before scoring
        # (presumably an end-of-sentence marker -- TODO confirm against rsg).
        sent = list(rsg.random_sentence(data, length))[:-1]
        num_breaks = num_cbreaks(sent)
        if num_breaks == 0:
            total_nobreaks += 1
        total_breaks += num_breaks
        total_words += len(sent)

    # Build the report from an explicit mapping.  Formatting from
    # locals() updated with globals() let any module-level name that
    # happened to match a statistic override the value computed here.
    report = {
        'length': length,
        'num_sents': num_sents,
        'perc_total_nobreaks': ratio(total_nobreaks, num_sents),
        'avg_words_per_sent': ratio(total_words, num_sents),
        'avg_breaks_per_sent': ratio(total_breaks, num_sents),
        'breaks_per_word': ratio(total_breaks, total_words),
    }
    template = "\n".join((
        "",
        "length=%(length)s",
        "num_sents=%(num_sents)s",
        "perc_total_nobreaks=%(perc_total_nobreaks)s"
        " #Straight-copied sentences; indicator of sparseness",
        "avg_words_per_sent=%(avg_words_per_sent)s",
        "avg_breaks_per_sent=%(avg_breaks_per_sent)s",
        "breaks_per_word=%(breaks_per_word)s",
        "",
    ))

    print("------------------- Results -----------------------")
    print(template % report)
breaks = 0 for i in range(0,length): end_of_ngram = sent[i] word, posls = end_of_ngram print "%-25s: n/a" %word for i in range(length, len(sent)): end_of_prev_ngram = sent[i-1] word,posls = end_of_prev_ngram prev_absolute_wordpositions = [pos[2] for pos in posls] end_of_ngram = sent[i] word, posls = end_of_ngram cur_absolute_wordpositions = [pos[2] for pos in posls] for cur_absolute_wordpos in cur_absolute_wordpositions: if cur_absolute_wordpos - 1 in prev_absolute_wordpositions: print "%-25s: continuous.." %word break #No continuity break! else: print "%-25s: Continuity break over the n-1-gram: %s " \ %(word, words[i-(length-1):i]) elif opt=='': print '-'*60 sent = list(rsg.random_sentence(data, length))[:-1] print ' '.join([itm[0] for itm in sent]) else: #read-eval-print loop by default print eval(opt)