def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None, **kwargs):
    """
    Initialize the corpus reader over a directory of plain-text files.

    :param root: The file root of the corpus directory
    :param fileids: the list of file ids to consider, or wildcard expression
    :param skip_keywords: a list of words which indicate whole paragraphs that
        should be skipped by the paras and words methods()
    :param encoding: utf8
    :param kwargs: Any values to be passed to NLTK super classes, such as
        sent_tokenizer, word_tokenizer.
    """
    # Default to every .txt file under root when no explicit fileids given.
    if not fileids:
        fileids = r'.*\.txt'

    # Initialize the NLTK corpus reader object.
    # NOTE: the original also called CorpusReader.__init__ a second time;
    # that call is redundant because PlaintextCorpusReader.__init__ already
    # chains to CorpusReader.__init__, so it has been removed.
    PlaintextCorpusReader.__init__(self, root, fileids, encoding)

    # Allow callers to override the default sentence/word tokenizers.
    if 'sent_tokenizer' in kwargs:
        self._sent_tokenizer = kwargs['sent_tokenizer']
    if 'word_tokenizer' in kwargs:
        self._word_tokenizer = kwargs['word_tokenizer']

    self.skip_keywords = skip_keywords
def __init__(self, sep="/",  # Note that . needs to be escaped
             pattern=chinese_pattern, root=None, fileids=None):
    """Initialize the reader with a regexp-based sentence tokenizer.

    :param sep: token/tag separator string
    :param pattern: regular expression used (with gaps) to split sentences
    :param root: the file root of the corpus directory
    :param fileids: the list of file ids to consider
    """
    # NOTE(review): stock NLTK PlaintextCorpusReader does not accept a
    # 'sep' keyword — confirm the base class used here actually does.
    sentence_splitter = RegexpTokenizer(pattern, gaps=True)
    PlaintextCorpusReader.__init__(
        self,
        sep=sep,
        root=root,
        fileids=fileids,
        sent_tokenizer=sentence_splitter,
        encoding="utf-8",
    )
def __init__(
        self, sep="/",  # Note that . needs to be escaped
        pattern=chinese_pattern, root=None, fileids=None):
    """Set up the plain-text reader, splitting sentences on *pattern*.

    :param sep: token/tag separator string
    :param pattern: regular expression used (with gaps) to split sentences
    :param root: the file root of the corpus directory
    :param fileids: the list of file ids to consider
    """
    # Collect the base-class arguments in one place for readability.
    reader_args = dict(
        sep=sep,
        root=root,
        fileids=fileids,
        sent_tokenizer=RegexpTokenizer(pattern, gaps=True),
        encoding="utf-8",
    )
    PlaintextCorpusReader.__init__(self, **reader_args)
def __init__(self, root, fields=DOC_PATTERN, sent_pattern=SENT_PATTERN,
             encoding='utf8', **kargs):
    """Initialize the reader with Janome word tokenization.

    :param root: directory containing the corpus files
    :param fields: pattern selecting the target corpus files
    :param sent_pattern: regular expression used to split sentences
    :param encoding: text encoding of the corpus files
    :param kargs: accepted for call-site compatibility; currently ignored
    """
    # Japanese morphological analysis for words; regexp split for sentences.
    word_tok = JanomeTokenizer()
    sent_tok = RegexpTokenizer(sent_pattern)
    PlaintextCorpusReader.__init__(
        self,
        root,
        fields,
        word_tokenizer=word_tok,
        sent_tokenizer=sent_tok,
        encoding=encoding,
    )
def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None, **kwargs):
    """
    Initialize the corpus reader over a directory of plain-text files.

    :param root: The file root of the corpus directory
    :param fileids: the list of file ids to consider, or wildcard expression
    :param skip_keywords: a list of words which indicate whole paragraphs that
        should be skipped by the paras and words methods()
    :param encoding: utf8
    :param kwargs: Any values to be passed to NLTK super classes, such as
        sent_tokenizer, word_tokenizer.
    """
    # Consistent with the sibling reader: default to all .txt files.
    if not fileids:
        fileids = r'.*\.txt'

    # Initialize the NLTK corpus reader object (it chains to
    # CorpusReader.__init__ itself; commented-out duplicate call removed).
    PlaintextCorpusReader.__init__(self, root, fileids, encoding)

    # Allow callers to override tokenizers and supply a POS tagger.
    if 'sent_tokenizer' in kwargs:
        self._sent_tokenizer = kwargs['sent_tokenizer']
    if 'word_tokenizer' in kwargs:
        self._word_tokenizer = kwargs['word_tokenizer']
    if 'pos_tagger' in kwargs:
        self.pos_tagger = kwargs['pos_tagger']

    # Bug fix: skip_keywords was documented and accepted but never stored,
    # so the paras/words skipping it promises could not work.
    self.skip_keywords = skip_keywords