def main():
    """Explore the words of a text file with NLTK from the command line.

    The file named by ``-i/--input`` is read, passed through
    ``remove_comments`` and tokenized with ``nltk.word_tokenize``.  When
    ``-a/--acro`` is given, only tokens accepted by ``is_acro`` are kept.
    The unique/total token counts are always printed.

    After that, at most one analysis runs; the options are checked in this
    order and the first one supplied wins:

    * ``-c/--concordance WORD``   -- print the concordance of WORD.
    * ``-d/--dispersion WORD...`` -- show a dispersion plot of the words.
    * ``-f/--frequency N``        -- print the N most common tokens as
      ``index count token`` rows (index starts at 0) and plot them.

    Returns:
        None.  All results are printed or plotted.
    """
    # Parsing user input
    parser = ap.ArgumentParser()
    parser.add_argument('-i', '--input', nargs='?', type=str, required=True,
                        help='Input filename.')
    parser.add_argument('-c', '--concordance', nargs='?', type=str,
                        default=None, help='Word concordance.')
    parser.add_argument('-d', '--dispersion', nargs='*', type=str,
                        default=None, help='Word dispersion.')
    parser.add_argument('-f', '--frequency', nargs='?', type=int,
                        default=None, help='Word frequency.')
    parser.add_argument('-a', '--acro', action='store_true',
                        help='Acronyms only.')
    args = parser.parse_args()

    with open(args.input, 'r') as f:
        plain = f.read()

    plain = remove_comments(plain)
    words = nltk.word_tokenize(plain)
    if args.acro:
        words = [w for w in words if is_acro(w)]

    # A single parenthesized argument makes print() behave identically under
    # Python 2 (print statement) and Python 3 (print function).
    print('%d unique words out of %d total words.' % (len(set(words)),
                                                      len(words)))

    text = nltk.Text(words)

    if args.concordance is not None:
        text.concordance(args.concordance)
        return

    if args.dispersion is not None:
        text.dispersion_plot(args.dispersion)
        return

    if args.frequency is not None:
        freq = FreqDist(text)
        # most_common() yields (token, count) pairs; unpack them instead of
        # reusing the name ``f``, which already refers to the input file.
        for i, (word, count) in enumerate(freq.most_common(args.frequency)):
            print('%9d%9d %s' % (i, count, word))
        freq.plot(args.frequency)
# Build an NLTK corpus from the .txt files under ./data, derive a Text object,
# its vocabulary and token counts, optionally run some exploratory examples,
# then compute per-author term-frequency series for two terms.
print("reading corpus")
# The trailing "" keeps the trailing path separator the original string had.
corpus_root = os.path.join(os.getcwd(), "data", "")
# The file-id pattern is a regular expression: escape the dot so that only
# names really ending in ".txt" match (".*.txt" accepted any character there).
file_ids = r".*\.txt"
corpus = PlaintextCorpusReader(corpus_root, file_ids)

print("building nltk text obj")
text = nltk.Text(corpus.words())
V = set(text)                # vocabulary: the distinct tokens of the text
c = Counter(corpus.words())  # occurrence count per token

# use this option to run examples
if run_examples:
    print("calculating colocations & freqdist")
    # Text.collocations(), Text.common_contexts() and Text.concordance()
    # print their own output and return None, so they are called directly;
    # wrapping them in print() only added a stray "None" line.
    text.collocations()
    f1 = FreqDist(text)
    f1.plot(50, cumulative=True)

    print("vocab exploration examples")
    # Up to 100 of the very long tokens (more than 15 characters).
    print([w for w in V if len(w) > 15][:100])
    # Longer tokens (more than 7 characters) that occur more than 7 times.
    print(sorted([w for w in V if len(w) > 7 and f1[w] > 7]))

    print("common contexts and concordances")
    # candidate terms: quit harm addict help pain clean sober high
    text.common_contexts(['sober', 'high'])
    text.concordance('sober')

# NOTE(review): the original indentation was lost, so it is not certain
# whether this section belonged inside the ``if run_examples:`` block above.
# It is kept unconditional here because it does not use ``f1`` -- confirm.
print("d/dx comment freq vs words")
p = term_freq_analysis.multi_author_postfreqterm(
    'I ', [author_lists[a] for a in top_authors])
p2 = term_freq_analysis.multi_author_postfreqterm(
    'sober', [author_lists[a] for a in top_authors])