def main():
    """Write a word-frequency histogram for every corpus file in ``f_names``.

    ``f_names`` is not defined here; it is read from the enclosing module.
    For each file name the corpus is loaded, its first 100000 items are used
    to build the vocabulary and the space-stripped corpus, the sorted
    vocabulary is written to ``'hist_' + fname`` and a one-line size summary
    is printed.
    """
    limit = 100000  # only this leading portion of each corpus is analysed
    for fname in f_names:
        # The loaded corpus has spaces replaced by underscores, is lowercase
        # and has all punctuation removed.
        raw = create_corpus.load_corpus(fname)
        # The vocabulary is a dict; its length is the number of unique words.
        word_counts = create_corpus.form_vocab(raw[:limit])
        stripped = create_corpus.space_strip(raw[:limit])
        # Sorted vocabulary as a list of elements [(key, freq), ...].
        ranked = sort(word_counts)
        write_vocab(ranked, 'hist_' + fname)
        # A single parenthesised expression prints the same text under the
        # Python 2 print statement as the original unparenthesised form.
        print(fname + " vocab size : " + str(len(word_counts))
              + " corpus size : " + str(len(stripped)))
def main():
    """Write an error histogram for each output file and print word length.

    For every listed output file the corpus is loaded from the name with its
    first seven characters removed, a vocabulary is formed from that corpus,
    and a second vocabulary is read via ``output_vocab(fname)``. The second
    vocabulary is sorted, passed together with the first to ``error`` and the
    result is written to ``'hist_' + fname``.
    """
    result_files = ['output_brown_religion.txt', 'output_coffee.txt']
    for fname in result_files:
        # fname[7:] drops the leading 'output_' of the names listed above.
        # The loaded corpus has spaces replaced by underscores, is lowercase
        # and has all punctuation removed.
        corpus = create_corpus.load_corpus(fname[7:])
        # The vocabulary is a dict; its length is the number of unique words.
        corpus_vocab = create_corpus.form_vocab(corpus)
        vocab = output_vocab(fname)
        ranked = plot_histogram.sort(vocab)
        err_vocab = error(corpus_vocab, ranked)
        plot_histogram.write_vocab(err_vocab, 'hist_' + fname)
        # '%s %s' reproduces the single space the Python 2 print statement
        # inserts between its two comma-separated items.
        print('%s %s' % ('Average word length is ', avg_word_length(vocab)))
def main():
    """Write a word-frequency histogram for every corpus file in ``f_names``.

    ``f_names`` is not defined here; it is read from the enclosing module.
    For each file name the corpus is loaded, its first 100000 items are used
    to build the vocabulary and the space-stripped corpus, the sorted
    vocabulary is written to ``'hist_' + fname`` and a one-line size summary
    is printed.

    NOTE(review): this block previously contained unresolved merge-conflict
    markers. It is resolved to the HEAD side. The incoming side
    (ba02c4f692249cdf8bcbc2930b4c74b2499e2e75) processed the whole corpus and
    additionally wrote a trimmed vocabulary to ``'freq_hist_1_' + fname`` and
    printed its contribution; HEAD had those steps commented out, so they
    stay disabled here. Confirm with the branch authors that this is the
    intended resolution.
    """
    limit = 100000  # HEAD analyses only this leading portion of each corpus
    for fname in f_names:
        # The loaded corpus has spaces replaced by underscores, is lowercase
        # and has all punctuation removed.
        corpus = create_corpus.load_corpus(fname)
        # The vocabulary is a dict; its length is the number of unique words.
        vocab = create_corpus.form_vocab(corpus[:limit])
        corpus = create_corpus.space_strip(corpus[:limit])
        # Sorted vocabulary as a list of elements [(key, freq), ...].
        sort_vocab = sort(vocab)
        write_vocab(sort_vocab, 'hist_' + fname)
        # A single parenthesised expression prints the same text under the
        # Python 2 print statement as the original unparenthesised form.
        print(fname + " vocab size : " + str(len(vocab))
              + " corpus size : " + str(len(corpus)))


if __name__ == "__main__":
    main()