# Positional and keyword arguments forwarded to the corpus reader.
reader_args = []
reader_kwargs = {}

# Every optional command-line setting is turned into the object that is
# handed to the reader; unset options are simply left out of the kwargs.
if args.word_tokenizer:
    # import_attr presumably resolves the dotted path to a class (confirm
    # against its definition); the result is called with no arguments.
    reader_kwargs.update(word_tokenizer=import_attr(args.word_tokenizer)())

if args.sent_tokenizer:
    reader_kwargs.update(
        sent_tokenizer=nltk.data.LazyLoader(args.sent_tokenizer))

if args.para_block_reader:
    # Unlike the word tokenizer, the imported attribute is passed uncalled.
    reader_kwargs.update(
        para_block_reader=import_attr(args.para_block_reader))

if args.trace:
    print('loading %s' % args.source_corpus)

input_corpus = load_corpus_reader(
    args.source_corpus, args.reader, *reader_args, **reader_kwargs)

#################
## translation ##
#################

for fileid in input_corpus.fileids():
    # TODO: use ~/nltk_data/corpora as dir prefix?
    # Each source fileid is mirrored under the target corpus directory.
    path = os.path.join(args.target_corpus, fileid)
    dirname = os.path.dirname(path)

    # Create the parent directory on first use.
    if not os.path.exists(dirname):
        if args.trace:
            print('making directory %s' % dirname)

        os.makedirs(dirname)
reader_args.append(args.cat_pattern) reader_kwargs['cat_pattern'] = re.compile(args.cat_pattern) if args.word_tokenizer: reader_kwargs['word_tokenizer'] = import_attr(args.word_tokenizer)() if args.sent_tokenizer: reader_kwargs['sent_tokenizer'] = nltk.data.LazyLoader(args.sent_tokenizer) if args.para_block_reader: reader_kwargs['para_block_reader'] = import_attr(args.para_block_reader) if args.trace: print 'loading %s' % args.corpus categorized_corpus = load_corpus_reader(args.corpus, args.reader, *reader_args, **reader_kwargs) if not hasattr(categorized_corpus, 'categories'): raise ValueError('%s is does not have categories for classification') labels = categorized_corpus.categories() nlabels = len(labels) if args.trace: print '%d labels: %s' % (nlabels, labels) if not nlabels: raise ValueError('corpus does not have any categories') elif nlabels == 1: raise ValueError('corpus must have more than 1 category') elif nlabels == 2 and args.multi:
multi-class classifier, the rest will be used for evaulation. The default is to use the entire corpus, and to test the classifier against the same training data. Any number < 1 will test against the remaining fraction.""", ) args = parser.parse_args() ################### ## corpus reader ## ################### if args.trace: print "loading corpus %s" % args.corpus corpus = load_corpus_reader(args.corpus) methods = { "sents": nltk_trainer.classification.corpus.category_sent_strings, "paras": nltk_trainer.classification.corpus.category_para_strings, "files": nltk_trainer.classification.corpus.category_file_strings, } cat_instances = methods[args.instances](corpus) ################ ## CSV output ## ################ filename = args.filename
# Options selecting the corpus reader class, the files it loads and the
# fraction of the corpus used.
corpus_group = parser.add_argument_group('Corpus Reader Options')
corpus_group.add_argument(
    '--reader',
    default=None,
    help='Full module path to a corpus reader class, such as '
         'nltk.corpus.reader.chunked.ChunkedCorpusReader',
)
corpus_group.add_argument(
    '--fileids',
    default=None,
    help='Specify fileids to load from corpus',
)
corpus_group.add_argument(
    '--fraction',
    default=1.0,
    type=float,
    help='The fraction of the corpus to use for testing coverage',
)

args = parser.parse_args()

###################
## corpus reader ##
###################

corpus = load_corpus_reader(
    args.corpus, reader=args.reader, fileids=args.fileids)

# Scoring is only possible when the reader exposes chunked_sents.
if args.score:
    if not hasattr(corpus, 'chunked_sents'):
        raise ValueError('%s does not support scoring' % args.corpus)

############
## tagger ##
############

if args.trace:
    print('loading tagger %s' % args.tagger)

tagger = nltk.data.load(args.tagger)

if args.trace:
    print('loading chunker %s' % args.chunker)
help='Full module path to a corpus reader class, defaults to %(default)s.') corpus_group.add_argument('--fileids', default=None, help='Specify fileids to load from corpus') corpus_group.add_argument('--sent-tokenizer', default='tokenizers/punkt/english.pickle', help='Path to pickled sentence tokenizer') corpus_group.add_argument('--word-tokenizer', default='nltk.tokenize.WordPunctTokenizer', help='Full module path to a tokenizer class, defaults to %(default)s.') args = parser.parse_args() ################### ## corpus reader ## ################### source_corpus = load_corpus_reader(args.source_corpus, reader=args.reader, fileids=args.fileids, encoding='utf-8', sent_tokenizer=args.sent_tokenizer, word_tokenizer=args.word_tokenizer) if not source_corpus: raise ValueError('%s is an unknown corpus') if args.trace: print 'loaded %s' % args.source_corpus ############ ## tagger ## ############ # TODO: from analyze_tagger_coverage.py if args.trace: print 'loading tagger %s' % args.tagger