#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Print each word seen under one simplified POS tag, with its frequency.

The tagged corpus file ``tagged_sent`` under ``./data`` is read, every
(word, tag) token is turned into a (simplified tag, lower-cased word) pair,
and the pairs are counted in a conditional frequency distribution keyed by
tag. The entries recorded for ``POS`` are then written to stdout, one
"word count" pair per line, with the word encoded as UTF-8.

NOTE(review): ``nltk.tag.simplify`` and the byte-string printing below look
like NLTK 2 / Python 2 idioms -- confirm the target environment.
"""
from nltk.probability import ConditionalFreqDist
from nltk.corpus import TaggedCorpusReader
from nltk.tag import simplify

# NOTE(review): FIRST and END are not referenced anywhere in this script;
# presumably they were meant to bound a slice of the output -- confirm.
FIRST = 0
END = 150

# Simplified tag whose words are listed. Earlier alternatives: "N", "ADJ".
POS = "V"

corpus_root = './data'
fileids = 'tagged_sent'
corpus = TaggedCorpusReader(corpus_root, fileids, encoding='utf-8')

# Build (condition, sample) pairs: condition is the simplified tag, sample
# is the lower-cased word.
processing = []
for word, tag in corpus.tagged_words():
    processing.append((simplify.simplify_wsj_tag(tag), word.lower()))

cfd_corpus = ConditionalFreqDist(processing)

# One line per distinct word recorded under POS: "<utf-8 word> <count>".
pos_freqs = cfd_corpus[POS]
for term, freq in pos_freqs.items():
    print('%s %s' % (term.encode("utf-8"), freq))
# Build a bigram score dictionary from a directory of tagged ``*.txt`` files
# and save it to ``bigram_scores.pkl`` in the current working directory.
#
# Usage: <script> CORPUS_DIR
from gmail_corpus.nltk_util.bigram_score import make_score_dict, save_score_dict
from nltk.corpus import TaggedCorpusReader
import numpy as np
from glob import glob
import os
import sys


def _remove_empty_files(corpus_path):
    """Delete zero-byte ``*.txt`` files directly under *corpus_path*.

    Each deletion is reported on stdout. Returns the list of removed paths.
    """
    removed = []
    for path in glob('%s/*.txt' % corpus_path):
        if os.path.getsize(path) == 0:
            os.remove(path)
            print('Removed empty file %s' % path)
            removed.append(path)
    return removed


def main(argv):
    """Score the bigrams of the corpus directory named by ``argv[1]``.

    Returns a process exit status: 0 on success, 2 when the corpus
    directory argument is missing.
    """
    if len(argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        prog = os.path.basename(argv[0]) if argv else 'script'
        sys.stderr.write('usage: %s CORPUS_DIR\n' % prog)
        return 2
    corpus_path = argv[1]

    # Empty files are purged before the reader is built.
    # NOTE(review): presumably the reader or scorer cannot cope with empty
    # files -- confirm; this permanently deletes them from CORPUS_DIR.
    _remove_empty_files(corpus_path)

    # Raw string keeps the regex escape ``\.`` from being treated as a
    # (deprecated) string escape; the pattern itself is unchanged.
    corpus = TaggedCorpusReader(corpus_path, r'.*\.txt')
    score_dict = make_score_dict(corpus.tagged_words())
    save_score_dict(score_dict, 'bigram_scores.pkl')
    print('saved bigram_scores.pkl')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))