Example #1
    def __init__(self,
                 root,
                 fileids=None,
                 encoding='utf8',
                 skip_keywords=None,
                 **kwargs):
        """

        :param root: The file root of the corpus directory
        :param fileids: the list of file ids to consider, or wildcard expression
        :param skip_keywords: a list of words which indicate whole paragraphs that should
        be skipped by the paras and words methods()
        :param encoding: utf8
        :param kwargs: Any values to be passed to NLTK super classes, such as sent_tokenizer,
        word_tokenizer.
        """
        if not fileids:
            fileids = r'.*\.txt'

        # Initialize the NLTK corpus reader object; pass encoding by keyword,
        # because the third positional parameter of PlaintextCorpusReader is
        # word_tokenizer, not encoding. PlaintextCorpusReader.__init__ already
        # invokes CorpusReader.__init__, so no separate call is needed.
        PlaintextCorpusReader.__init__(self, root, fileids, encoding=encoding)
        if 'sent_tokenizer' in kwargs:
            self._sent_tokenizer = kwargs['sent_tokenizer']
        if 'word_tokenizer' in kwargs:
            self._word_tokenizer = kwargs['word_tokenizer']
        self.skip_keywords = skip_keywords
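
A minimal usage sketch for this reader. The class name (SkippableCorpusReader), the corpus path, and the keyword values are assumptions for illustration; the example above only shows the __init__ of an unnamed PlaintextCorpusReader subclass.

from nltk.tokenize import RegexpTokenizer

# Hypothetical: assume the class above is named SkippableCorpusReader and
# that /path/to/corpus contains plain-text .txt files.
reader = SkippableCorpusReader(
    '/path/to/corpus',
    skip_keywords=['ADVERTISEMENT', 'COPYRIGHT'],
    sent_tokenizer=RegexpTokenizer(r'[.!?]+', gaps=True),  # picked up via kwargs
)
print(reader.fileids())     # every file matching the default r'.*\.txt'
print(reader.words()[:10])  # words() is inherited from PlaintextCorpusReader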
Example #2
    def __init__(self, sep="/",
                 # Note that . needs to be escaped
                 pattern=chinese_pattern,
                 root=None, fileids=None):
        """Initialize the reader with a regexp-based sentence tokenizer."""
        # PlaintextCorpusReader does not accept a `sep` argument, so keep it
        # on the instance instead of forwarding it to the superclass.
        self.sep = sep
        PlaintextCorpusReader.__init__(
            self,
            root=root, fileids=fileids,
            sent_tokenizer=RegexpTokenizer(pattern, gaps=True),
            encoding="utf-8")
Example #3
    def __init__(
            self,
            sep="/",
            # Note that . needs to be escaped
            pattern=chinese_pattern,
            root=None,
            fileids=None):
        """Initialize the reader with a regexp-based sentence tokenizer."""
        # As in Example #2: PlaintextCorpusReader takes no `sep` argument,
        # so store it on the instance rather than forwarding it.
        self.sep = sep
        PlaintextCorpusReader.__init__(self,
                                       root=root,
                                       fileids=fileids,
                                       sent_tokenizer=RegexpTokenizer(
                                           pattern, gaps=True),
                                       encoding="utf-8")
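
Both variants rely on RegexpTokenizer(pattern, gaps=True): with gaps=True the pattern matches act as separators rather than tokens, so the tokenizer works as a sentence splitter. chinese_pattern is defined elsewhere in the source; below is a standalone sketch with an assumed pattern for Chinese sentence-final punctuation.

from nltk.tokenize import RegexpTokenizer

# Assumed stand-in for chinese_pattern: split on Chinese sentence-final punctuation.
splitter = RegexpTokenizer(r'[。！？]', gaps=True)
print(splitter.tokenize('今天天气很好。我们去公园吧！'))
# -> ['今天天气很好', '我们去公园吧']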
Example #4
    def __init__(self,
                 root,
                 fields=DOC_PATTERN,
                 sent_pattern=SENT_PATTERN,
                 encoding='utf8',
                 **kargs):
        """
        :param root: corpusが入っているdir
        :param fields: 対象となるcorpus
        :param encoding:
        """

        PlaintextCorpusReader.__init__(
            self,
            root,
            fields,
            word_tokenizer=JanomeTokenizer(),
            sent_tokenizer=RegexpTokenizer(sent_pattern),
            encoding=encoding)
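
JanomeTokenizer is not an NLTK class; it is presumably a user-defined word tokenizer for Japanese built on the janome morphological analyzer. A minimal sketch of such a wrapper, assuming the janome package is installed and implementing NLTK's TokenizerI interface:

from janome.tokenizer import Tokenizer
from nltk.tokenize.api import TokenizerI

class JanomeTokenizer(TokenizerI):
    """Japanese word tokenizer backed by janome."""

    def __init__(self):
        self._tokenizer = Tokenizer()

    def tokenize(self, text):
        # wakati=True makes janome yield plain surface-form strings.
        return list(self._tokenizer.tokenize(text, wakati=True))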
Example #5
    def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None,
                 **kwargs):
        """
        :param root: the file root of the corpus directory
        :param fileids: the list of file ids to consider, or a wildcard expression
        :param skip_keywords: a list of words that mark whole paragraphs to be
            skipped by the paras() and words() methods
        :param encoding: the encoding of the corpus files (default 'utf8')
        :param kwargs: any values to pass through to the NLTK superclasses,
            such as sent_tokenizer or word_tokenizer
        """
        # Initialize the NLTK corpus reader object; pass encoding by keyword,
        # because the third positional parameter of PlaintextCorpusReader is
        # word_tokenizer, not encoding.
        PlaintextCorpusReader.__init__(self, root, fileids, encoding=encoding)
        # CorpusReader.__init__(self, root, fileids, encoding)
        if 'sent_tokenizer' in kwargs:
            self._sent_tokenizer = kwargs['sent_tokenizer']
        if 'word_tokenizer' in kwargs:
            self._word_tokenizer = kwargs['word_tokenizer']
        if 'pos_tagger' in kwargs:
            self.pos_tagger = kwargs['pos_tagger']
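
Compared with Example #1, this version leaves the redundant CorpusReader.__init__ call commented out and also accepts a pos_tagger through kwargs. A hypothetical instantiation follows; the class name, path, and tagger choice are assumptions, and nltk.pos_tag requires the averaged_perceptron_tagger data to be downloaded.

import nltk

reader = TaggingCorpusReader(    # hypothetical class name
    '/path/to/corpus',
    fileids=r'.*\.txt',          # this variant has no default wildcard, so pass one
    pos_tagger=nltk.pos_tag,     # stored on the reader as self.pos_tagger
)
print(reader.pos_tagger(reader.words()[:20]))  # list of (word, tag) pairs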