# NOTE(review): Python 2 code (`print >> sys.stderr`) collapsed onto one physical
# line — all newlines/indentation were stripped, so this line is not valid Python
# as written. It reads as a script entry point: emit a usage message and
# sys.exit(1) (the leading print/exit pair belongs to an argv-length guard whose
# `if` header is outside this view — do not reflow without recovering the
# original structure), then: collect document paths from each dir in
# sys.argv[3:] via collect_files(); load stop words from sys.argv[1] into a
# Vocabulary; stem every file with stem_file(), accumulating per-file token
# lists in `articles`; sort and save the vocabulary to '<argv[2]>-vocab'; and
# write '<argv[2]>-train' with one line per article of space-separated token
# ids (built via string += — presumably fine for small corpora, but quadratic).
# The `# random.shuffle(articles)` inside is pre-existing commented-out code.
# TODO(review): recover original line breaks/indentation before any refactor.
print >> sys.stderr, '%s [stop word file] [output name] ' \ '[doc file] ...' % sys.argv[0] sys.exit(1) file_list = [] for _dir in sys.argv[3:]: collect_files(file_list, _dir) stop_word = Vocabulary() stop_word.load(sys.argv[1]) vocab = Vocabulary() articles = [] for filename in file_list: article = stem_file(filename, vocab, stop_word) articles.append(article) # random.shuffle(articles) vocab.sort() vocab.save(sys.argv[2] + '-vocab') fp = open(sys.argv[2] + '-train', 'w') for article in articles: sb = '' for word in article: sb += '%d ' % vocab.get_id_from_token(word) sb = sb.rstrip() fp.write(sb) fp.write('\n') fp.close()
# NOTE(review): also whitespace-mangled Python 2 — two fragments fused onto one
# physical line. First fragment: the TAIL of stem_file() (its `def` header is
# outside this view): increments word_count[stemmed_word], closes `infile`, and
# returns the word-count dict. Second fragment: a complete
# `if __name__ == '__main__'` driver: require at least 2 args (else print usage
# to stderr and sys.exit(1)); load stop words from sys.argv[1]; run stem_file()
# over each path in sys.argv[2:], collecting per-file count dicts; sort and save
# the vocabulary to 'train.vocab'; then write 'train' with one line per file of
# sparse 'id:count ' pairs (ids from vocab.get_id_from_token). Local `id`
# shadows the builtin — worth renaming once the file is reflowed.
# TODO(review): recover original line breaks/indentation before any refactor.
count = word_count.get(stemmed_word, 0) + 1 word_count[stemmed_word] = count infile.close() return word_count if __name__ == '__main__': if len(sys.argv) <= 2: print >>sys.stderr, '%s [stop word file] [doc file] ...' % sys.argv[0] sys.exit(1) stop_word = Vocabulary() stop_word.load(sys.argv[1]) vocab = Vocabulary() word_count_list = [] for filename in sys.argv[2:]: word_count = stem_file(filename, vocab, stop_word) word_count_list.append(word_count) vocab.sort() vocab.save('train.vocab') fp = open('train', 'w') for word_count in word_count_list: for word in word_count.keys(): id = vocab.get_id_from_token(word) count = word_count[word] fp.write('%d:%d ' % (id, count)) fp.write('\n') fp.close()