예제 #1
0
def main():
    """Tokenize a text file and report word statistics from the command line.

    The file named by ``-i/--input`` is read, passed through
    ``remove_comments`` and tokenized with ``nltk.word_tokenize``. When
    ``-a/--acro`` is given, only tokens accepted by ``is_acro`` are kept.
    The unique/total token counts are always printed. After that at most
    one report is produced, checked in this order:

    * ``-c/--concordance WORD``: concordance of WORD.
    * ``-d/--dispersion WORD ...``: dispersion plot of the given words.
    * ``-f/--frequency N``: table and plot of the N most common tokens.

    An option declared with ``nargs='?'`` that is passed without a value
    parses to ``None``; for ``-c`` and ``-f`` that means the corresponding
    report is skipped.
    """
    # Parsing user input
    parser = ap.ArgumentParser()
    parser.add_argument('-i',
                        '--input',
                        nargs='?',
                        type=str,
                        required=True,
                        help='Input filename.')
    parser.add_argument('-c',
                        '--concordance',
                        nargs='?',
                        type=str,
                        default=None,
                        help='Word concordance.')
    parser.add_argument('-d',
                        '--dispersion',
                        nargs='*',
                        type=str,
                        default=None,
                        help='Word dispersion.')
    parser.add_argument('-f',
                        '--frequency',
                        nargs='?',
                        type=int,
                        default=None,
                        help='Word frequency.')
    parser.add_argument('-a',
                        '--acro',
                        action='store_true',
                        help='Acronyms only.')
    args = parser.parse_args()

    # ``nargs='?'`` lets a bare ``-i`` through with a value of None even
    # though the option is required; report it as a usage error instead of
    # letting ``open(None)`` raise a TypeError.
    if args.input is None:
        parser.error('argument -i/--input: expected a filename')

    with open(args.input, 'r') as infile:
        plain = infile.read()

    plain = remove_comments(plain)

    words = nltk.word_tokenize(plain)

    if args.acro:
        words = [w for w in words if is_acro(w)]

    # A single parenthesized argument prints identically on Python 2 and 3.
    print('%d unique words out of %d total words.' % (len(set(words)),
                                                      len(words)))

    text = nltk.Text(words)

    if args.concordance is not None:
        text.concordance(args.concordance)
        return

    if args.dispersion is not None:
        text.dispersion_plot(args.dispersion)
        return

    if args.frequency is not None:
        freq = FreqDist(text)
        # Rank is zero-based, matching the original output format.
        for rank, (word, count) in enumerate(
                freq.most_common(args.frequency)):
            print('%9d%9d %s' % (rank, count, word))
        freq.plot(args.frequency)
예제 #2
0
# Load every .txt file under ./data (relative to the current working
# directory) as a plain-text corpus.
print("reading corpus")
corpus_root = os.getcwd() + "/data/"
# The file-id argument is a regular expression: escape the dot so only names
# ending in a literal ".txt" are selected (the unescaped ".*.txt" also
# matched names such as "notes_txt").
file_ids = r".*\.txt"
corpus = PlaintextCorpusReader(corpus_root, file_ids)

print("building nltk text obj")
text = nltk.Text(corpus.words())
V = set(text)  # vocabulary: the distinct tokens in the corpus
c = Counter(corpus.words())  # token -> occurrence count

# Set run_examples to run the exploratory examples below.
if run_examples:
    print("calculating colocations & freqdist")
    # Text.collocations() prints its result itself and returns None, so it
    # must not be wrapped in print() (that emitted a stray "None" line).
    text.collocations()
    f1 = FreqDist(text)
    f1.plot(50, cumulative=True)

    print("vocab exploration examples")
    # Up to 100 tokens longer than 15 characters.
    print([w for w in V if len(w) > 15][:100])
    # Tokens longer than 7 characters that occur more than 7 times.
    print(sorted([w for w in V if len(w) > 7 and f1[w] > 7]))

    print("common contexts and concordances")
    # Candidate terms: quit harm addict help pain clean sober high
    # common_contexts() and concordance() also print their own output and
    # return None; call them directly.
    text.common_contexts(['sober', 'high'])
    text.concordance('sober')

    print("d/dx comment freq vs words")
    # NOTE(review): multi_author_postfreqterm, author_lists and top_authors
    # are defined outside this section; their semantics are not visible here.
    p = term_freq_analysis.multi_author_postfreqterm(
        'I ', [author_lists[a] for a in top_authors])
    p2 = term_freq_analysis.multi_author_postfreqterm(
        'sober', [author_lists[a] for a in top_authors])