help='Full module path to a corpus reader class, such as %(default)s')
corpus_group.add_argument('--word-tokenizer', default='', help='Word Tokenizer class path')
corpus_group.add_argument('--sent-tokenizer', default='', help='Sent Tokenizer data.pickle path')
corpus_group.add_argument('--para-block-reader', default='', help='Block reader function path')

args = parser.parse_args()

###################
## corpus reader ##
###################

reader_args = []
reader_kwargs = {}

if args.word_tokenizer:
	reader_kwargs['word_tokenizer'] = import_attr(args.word_tokenizer)()

if args.sent_tokenizer:
	reader_kwargs['sent_tokenizer'] = nltk.data.LazyLoader(args.sent_tokenizer)

if args.para_block_reader:
	reader_kwargs['para_block_reader'] = import_attr(args.para_block_reader)
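
# Aside: import_attr resolves a dotted path string such as
# 'nltk.tokenize.TreebankWordTokenizer' to the object it names. A minimal
# sketch of such a helper (assumption: the real nltk_trainer.import_attr
# may differ in details; the name below is hypothetical):
import importlib

def _import_attr_sketch(path):
	# Split 'pkg.module.Attr' into a module path and an attribute name,
	# import the module, and pull the attribute off it.
	module_name, _, attr_name = path.rpartition('.')
	return getattr(importlib.import_module(module_name), attr_name)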

if args.trace:
	print('loading %s' % args.source_corpus)

input_corpus = load_corpus_reader(args.source_corpus, args.reader,
	*reader_args, **reader_kwargs)
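
# Rough sketch of what a load_corpus_reader helper might do (assumption:
# the real nltk_trainer function also resolves corpus paths against the
# NLTK data directories; the name below is hypothetical). It shows how
# reader_args and reader_kwargs flow into the reader class constructor:
def _load_corpus_reader_sketch(corpus, reader, *args, **kwargs):
	reader_cls = _import_attr_sketch(reader)  # e.g. a PlaintextCorpusReader subclass
	return reader_cls(corpus, *args, **kwargs)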

#################
## translation ##
#################

Example #2

if args.cat_file:
    reader_kwargs['cat_file'] = args.cat_file

    if args.delimiter and args.delimiter != ' ':
        reader_kwargs['delimiter'] = args.delimiter

    if args.cat_pattern:
        reader_args.append(args.cat_pattern)
    else:
        reader_args.append('.+/.+')
elif args.cat_pattern:
    reader_args.append(args.cat_pattern)
    reader_kwargs['cat_pattern'] = re.compile(args.cat_pattern)
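
# Illustration of how these arguments map onto NLTK's categorized readers:
# the positional pattern doubles as the fileids regex, cat_pattern is a
# regex whose first group names the category, and cat_file points to a
# mapping file with lines like 'fileid category'. A minimal sketch (the
# 'corpus/' layout here is hypothetical):
#
#   from nltk.corpus.reader import CategorizedPlaintextCorpusReader
#   reader = CategorizedPlaintextCorpusReader('corpus/', r'.+\.txt',
#                                             cat_pattern=r'(\w+)/.+')
#   reader.categories()  # e.g. ['ham', 'spam'] for ham/*.txt, spam/*.txt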

if args.word_tokenizer:
    reader_kwargs['word_tokenizer'] = import_attr(args.word_tokenizer)()

if args.sent_tokenizer:
    reader_kwargs['sent_tokenizer'] = nltk.data.LazyLoader(args.sent_tokenizer)
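    # Note: --sent-tokenizer takes an nltk.data path to a pickled tokenizer,
    # e.g. 'tokenizers/punkt/english.pickle'. LazyLoader defers the actual
    # nltk.data.load() until the tokenizer is first used.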

if args.para_block_reader:
    reader_kwargs['para_block_reader'] = import_attr(args.para_block_reader)

if args.trace:
    print('loading %s' % args.corpus)

categorized_corpus = load_corpus_reader(args.corpus, args.reader, *reader_args,
                                        **reader_kwargs)

if not hasattr(categorized_corpus, 'categories'):
    raise ValueError('%s does not have categories for classification' % args.corpus)
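
# Downstream, the categorized corpus is typically consumed through NLTK's
# standard CategorizedCorpusReader API, e.g.:
#
#   for category in categorized_corpus.categories():
#       fileids = categorized_corpus.fileids(categories=[category])
#       words = categorized_corpus.words(fileids=fileids)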