Пример #1
0
    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding="latin1",
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        Note that the third positional argument is ``sep``, so the
        ``'.txt'`` in the example above is bound to ``sep``, not to a file
        extension filter.

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Stored as ``self._sep``; presumably the separator used
            inside tokens -- confirm against the methods that read it.
        :param word_tokenizer: Stored as ``self._word_tokenizer``.  The
            default instance is created once, when the function is defined.
        :param sent_tokenizer: Stored as ``self._sent_tokenizer``.  The
            default instance is created once, when the function is defined.
        :param alignedsent_block_reader: Stored as
            ``self._alignedsent_block_reader``.
        :param encoding: Forwarded to ``CorpusReader.__init__``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
Пример #2
0
    def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None,
                 target_language=None, paragraph_separator='\n\n', **kwargs):
        """
        Set up a reader over the JSON documents found under ``root``.

        :param root: The file root of the corpus directory
        :param fileids: the list of file ids to consider, or wildcard expression;
        when falsy, a pattern built from ``target_language`` is used instead
        :param encoding: forwarded to ``CorpusReader.__init__``
        :param skip_keywords: a list of words which indicate whole paragraphs that
        should be skipped by the paras and words methods()
        :param target_language: which files to select; sometimes a corpus contains
        English translations, we expect these files to be named ...english.json --
        if not, pass in fileids
        :param paragraph_separator: character sequence demarcating paragraph separation
        :param kwargs: only ``sent_tokenizer`` and ``word_tokenizer`` are read here;
        each one, when present, replaces the corresponding private attribute.
        """
        # An unset language contributes nothing to the default file pattern.
        language_suffix = target_language or ''
        selected_fileids = fileids or r'.*{}\.json'.format(language_suffix)

        # Initialize the NLTK corpus reader objects
        CorpusReader.__init__(self, root, selected_fileids, encoding)

        # Override only the tokenizers the caller actually supplied.
        for option in ('sent_tokenizer', 'word_tokenizer'):
            if option in kwargs:
                setattr(self, '_' + option, kwargs[option])

        self.skip_keywords = skip_keywords
        self.paragraph_separator = paragraph_separator
Пример #3
0
Файл: knbc.py Проект: DrDub/nltk
 def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
     """
     Initialize KNBCorpusReader.

     :param root: forwarded to ``CorpusReader.__init__``.
     :param fileids: forwarded to ``CorpusReader.__init__``.
     :param encoding: forwarded to ``CorpusReader.__init__``.
     :param morphs2str: a function to convert a morph list to a str for the
         tree representation used by ``_parse()``; stored as
         ``self.morphs2str``.
     """
     CorpusReader.__init__(self, root, fileids, encoding)
     self.morphs2str = morphs2str
Пример #4
0
 def __init__(self, root, fileids=DOC_PATTERN, **kwargs):
     """
     Initialize the corpus reader.

     Only ``root`` and ``fileids`` are forwarded to the ``CorpusReader``
     constructor.  Extra keyword arguments (for example the categorization
     arguments ``cat_pattern``, ``cat_map`` and ``cat_file``) are accepted
     but are not used by this constructor.

     NOTE(review): an earlier version of this docstring said the
     categorization arguments were passed to ``CategorizedCorpusReader``;
     no such call is made here -- confirm whether one is missing.
     """
     CorpusReader.__init__(self, root, fileids)
Пример #5
0
 def __init__(self, root, fileids, tone, tag, wrap_etree=False):
     """
     Initialize the reader and its (initially empty) result lists.

     :param root: forwarded to ``CorpusReader.__init__``.
     :param fileids: stored as ``self.fileids`` and forwarded to
         ``CorpusReader.__init__``.
     :param tone: stored as ``self.option_tone``.
     :param tag: stored as ``self.option_tag``.
     :param wrap_etree: stored as ``self._wrap_etree``.

     NOTE(review): ``fileids``, ``sents``, ``words``, ``tagged_sents`` and
     ``tagged_words`` are set as instance attributes; if the base class
     defines methods with these names they are shadowed on the instance --
     confirm this is intended.
     """
     self.fileids = fileids
     self._wrap_etree = wrap_etree
     CorpusReader.__init__(self, root, fileids)

     # Every collection starts out as its own empty list.
     self.tagged_sents, self.sents = [], []
     self.words, self.tagged_words = [], []

     self.option_tone, self.option_tag = tone, tag
Пример #6
0
 def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
     """
     Initialize KNBCorpusReader.

     :param root: forwarded to ``CorpusReader.__init__``.
     :param fileids: forwarded to ``CorpusReader.__init__``.
     :param encoding: forwarded to ``CorpusReader.__init__``.
     :param morphs2str: a function to convert a morph list to a str for the
         tree representation used by ``_parse()``; stored as
         ``self.morphs2str``.
     """
     # FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
     #       from CorpusReader?
     CorpusReader.__init__(self, root, fileids, encoding)
     self.morphs2str = morphs2str
 def __init__(self, root, fileids,
              syntax_parser=CaboChaParser(),
              word_tokenizer=MeCabTokenizer(),
              sent_tokenizer=jp_sent_tokenizer,
              case_parser=KNPParser(),
              encoding='utf-8'):
   """
   Initialize the reader and remember the analysis components to use.

   The default parser and tokenizer instances in the signature are created
   once, when the function is defined.

   :param root: forwarded to ``CorpusReader.__init__``.
   :param fileids: forwarded to ``CorpusReader.__init__``.
   :param syntax_parser: stored as ``self._syntax_parser``.
   :param word_tokenizer: stored as ``self._word_tokenizer``.
   :param sent_tokenizer: stored as ``self._sent_tokenizer``.
   :param case_parser: stored as ``self._case_parser``.
   :param encoding: forwarded to ``CorpusReader.__init__``.
   """
   CorpusReader.__init__(self, root, fileids, encoding)
   self._syntax_parser = syntax_parser
   self._word_tokenizer = word_tokenizer
   self._sent_tokenizer = sent_tokenizer
   self._case_parser = case_parser
Пример #8
0
 def __init__(self, root, zipfile, fileids):
     """
     Initialize a reader whose files live inside a zip archive.

     :param root: a string path or a ``PathPointer``; a string is wrapped
         in a ``FileSystemPathPointer`` first.
     :param zipfile: joined onto ``root`` to locate the archive.
     :param fileids: forwarded to ``CorpusReader.__init__``.
     :raises TypeError: if ``root`` is neither a string nor a
         ``PathPointer``.
     """
     root_is_string = isinstance(root, basestring)
     if not (root_is_string or isinstance(root, PathPointer)):
         raise TypeError('CorpusReader: expected a string or a PathPointer')
     if root_is_string:
         root = FileSystemPathPointer(root)

     # Re-root the reader at the archive itself.
     archive_root = ZipFilePathPointer(root.join(zipfile))
     CorpusReader.__init__(self, archive_root, fileids)

     self._parse_char_replacements()
Пример #9
0
    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.

        The keyword arguments are handed, as a single dict, to the
        ``CategorizedCorpusReader`` constructor; when none of them starts
        with ``cat_`` a default ``cat_pattern`` is added first.  ``root`` and
        ``fileids`` are then passed to the ``CorpusReader`` constructor.
        """
        # Fall back to the default category pattern when the caller gave
        # no categorization argument at all.
        caller_gave_categories = any(
            name.startswith('cat_') for name in kwargs)
        if not caller_gave_categories:
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)
Пример #10
0
 def __init__(self, root, fileids,
              sep='/', word_tokenizer=WhitespaceTokenizer(),
              sent_tokenizer=RegexpTokenizer('\n', gaps=True),
              encoding=None, alignedsent_block_reader=None):
     """
     Construct a new aligned corpus reader.

     @param root: The root directory for this corpus.
     @param fileids: A list or regexp specifying the fileids in this corpus.
     @param sep: Stored as C{self._sep}.
     @param word_tokenizer: Stored as C{self._word_tokenizer}.
     @param sent_tokenizer: Stored as C{self._sent_tokenizer}.
     @param encoding: Forwarded to C{CorpusReader.__init__}.
     @param alignedsent_block_reader: Optional block reader, stored as
         C{self._alignedsent_block_reader}.  Added at the end of the
         signature so existing positional callers are unaffected.
     """
     CorpusReader.__init__(self, root, fileids, encoding)
     self._sep = sep
     self._word_tokenizer = word_tokenizer
     self._sent_tokenizer = sent_tokenizer
     # The previous code ended this assignment with a trailing comma, which
     # stored the one-element tuple ``(None,)`` instead of ``None``, and then
     # assigned the attribute to itself.
     self._alignedsent_block_reader = alignedsent_block_reader
     self._alignedsent_corpus_view = None
Пример #11
0
    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader and its tokenizers.

        The keyword arguments are handed, as a single dict, to the
        ``CategorizedCorpusReader`` constructor; when none of them starts
        with ``cat_`` a default ``cat_pattern`` is added first.  ``root`` and
        ``fileids`` are then passed to the ``CorpusReader`` constructor.
        """
        # Look for any categorization argument; if the scan finishes without
        # finding one, install the default category pattern.
        for name in kwargs:
            if name.startswith('cat_'):
                break
        else:
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

        # Word-level and sentence-level tokenizers used by this reader.
        punkt_resource = 'tokenizers/punkt/english.pickle'
        self._word_tokenizer = WordPunctTokenizer()
        self._sent_tokenizer = nltk.data.LazyLoader(punkt_resource)
Пример #12
0
    def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8',
                 tags=TAGS, **kwargs):
        """
        Initialize the corpus reader.

        The keyword arguments are handed, as a single dict, to the
        ``CategorizedCorpusReader`` constructor; when none of them starts
        with ``cat_`` a default ``cat_pattern`` is added first.  ``root``,
        ``fileids`` and ``encoding`` are passed to the ``CorpusReader``
        constructor, and ``tags`` is kept on the instance.
        """
        # Use the default category pattern unless the caller supplied some
        # categorization argument.
        no_category_args = all(
            not name.startswith('cat_') for name in kwargs)
        if no_category_args:
            kwargs['cat_pattern'] = CAT_PATTERN

        # Initialize the NLTK corpus reader objects
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        # The tags we specifically want to extract.
        self.tags = tags
Пример #13
0
def assemble_corpus(corpus_reader: CorpusReader,
                    types_requested: List[str],
                    type_dirs: Dict[str, List[str]] = None,
                    type_files: Dict[str, List[str]] = None) -> CorpusReader:
    """
    Create a filtered corpus.

    The reader's ``_fileids`` attribute is replaced by the sorted,
    duplicate-free list of file ids that belong to one of the requested
    types.  A file id is selected when it is listed under a requested type
    in ``type_files``, or when it starts with a directory listed under a
    requested type in ``type_dirs``.  If nothing matches, the reader is
    returned with an empty file list.

    :param corpus_reader: This gets mutated
    :param types_requested: a list of string types, which are to be found in the type_dirs and
    type_files mappings
    :param type_dirs: a dict of corpus types to directories
    :param type_files: a dict of corpus types to files
    :return: the same CorpusReader object, restricted to the mappings desired;
    ``None`` if an unexpected error occurred (the error is logged)
    """
    try:
        all_file_ids = list(corpus_reader.fileids())
        known_ids = set(all_file_ids)  # O(1) membership tests
        selected = set()

        if type_files:
            for key, valuelist in type_files.items():
                # Falsy keys are never selected through the file mapping.
                if key and key in types_requested:
                    selected.update(
                        value for value in valuelist if value in known_ids)

        if type_dirs:
            for key, valuelist in type_dirs.items():
                if key in types_requested:
                    for value in valuelist:
                        # Strip './' and add a trailing slash so that only
                        # files inside the directory match.
                        corrected_dir = '{}/'.format(value.replace('./', ''))
                        selected.update(
                            name for name in all_file_ids
                            if name and name.startswith(corrected_dir))

        # An empty selection is a valid result; previously it made the
        # unpacking of ``zip(*[])`` fail and was reported as an error.
        corpus_reader._fileids = sorted(selected)
        return corpus_reader
    except Exception:
        # Best-effort boundary: report the problem and signal failure.
        LOG.exception('failure in corpus building')
        return None
Пример #14
0
    def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None,
                 **kwargs):
        """
        :param root: The file root of the corpus directory
        :param fileids: the list of file ids to consider, or wildcard expression;
        when falsy, every ``.txt`` file is selected
        :param skip_keywords: a list of words which indicate whole paragraphs that should
        be skipped by the paras and words methods()
        :param encoding: utf8
        :param kwargs: Any values to be passed to NLTK super classes, such as sent_tokenizer,
        word_tokenizer.
        """
        if not fileids:
            fileids = r'.*\.txt'

        # Initialize the NLTK corpus reader objects.  ``encoding`` must be
        # passed by keyword: the third positional parameter of NLTK's
        # ``PlaintextCorpusReader.__init__`` is ``word_tokenizer``, so passing
        # it positionally installed the encoding string as the tokenizer.
        # That constructor initializes ``CorpusReader`` itself, so no second
        # base-class call is needed.
        PlaintextCorpusReader.__init__(self, root, fileids, encoding=encoding)
        if 'sent_tokenizer' in kwargs:
            self._sent_tokenizer = kwargs['sent_tokenizer']
        if 'word_tokenizer' in kwargs:
            self._word_tokenizer = kwargs['word_tokenizer']
        self.skip_keywords = skip_keywords
Пример #15
0
    def __init__(self, root, fileids=None,
                 word_tokenizer=TweetTokenizer(),
                 encoding='utf8'):
        """
        Initialize the reader and reject empty corpus files.

        :param root: The root directory for this corpus.

        :param fileids: A list or regexp specifying the fileids in this corpus.

        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.

        :param encoding: Forwarded to ``CorpusReader.__init__``.

        :raises ValueError: if a corpus file that is not a
        ``ZipFilePathPointer`` has size zero.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty; entries
        # addressed through a zip archive are not size-checked.
        for path in self.abspaths(self._fileids):
            on_disk = not isinstance(path, ZipFilePathPointer)
            if on_disk and os.path.getsize(path) == 0:
                raise ValueError("File {} is empty".format(path))

        self._word_tokenizer = word_tokenizer
    def __init__(self,
                 root,
                 fileids,
                 column_types=None,
                 top_node='S',
                 beginning_of_sentence=r'#BOS.+$',
                 end_of_sentence=r'#EOS.+$',
                 encoding=None):
        """ Construct a new corpus reader for reading NEGRA corpus files.
        @param root: The root directory of the corpus files.
        @param fileids: A list of or regex specifying the files to read from.
        @param column_types: An optional C{list} of columns in the corpus.
            Any value that is not a C{list} (including C{None}) selects
            C{self.COLUMN_TYPES}.
        @param top_node: The top node of parsed sentence trees.
        @param beginning_of_sentence: A regex specifying the start of a sentence
        @param end_of_sentence: A regex specifying the end of a sentence
        @param encoding: The default corpus file encoding.
        @raise ValueError: If C{column_types} is a list containing an entry
            that is not in C{self.COLUMN_TYPES}.
        """

        # Make sure there are no invalid column types
        if isinstance(column_types, list):
            for column_type in column_types:
                if column_type not in self.COLUMN_TYPES:
                    # Report the offending entry (the old code referenced an
                    # undefined name here and raised NameError instead).
                    raise ValueError(
                        "Column %r is not supported." % column_type)
        else:
            column_types = self.COLUMN_TYPES

        # Define stuff
        self._top_node = top_node
        self._column_types = column_types
        self._fileids = fileids
        self._bos = beginning_of_sentence
        self._eos = end_of_sentence
        # Map each column name to its position in a corpus line.
        self._colmap = {c: i for (i, c) in enumerate(column_types)}

        # Finish constructing by calling the extended class' constructor
        CorpusReader.__init__(self, root, fileids, encoding)
Пример #17
0
 def fileids(self, channels=None, domains=None, categories=None):
     """
     Return the file ids of the corpus, optionally filtered.

     At most one of the three filters may be given.  Each filter accepts
     either a single string or a collection of strings.

     :param channels: restrict to files selected by ``'channel'``.
     :param domains: restrict to files selected by ``'domain'``.
     :param categories: restrict to files selected by ``'keyTerm'``, using
         ``self._map_category`` as the mapping function.
     :return: ``CorpusReader.fileids(self)`` when no filter is given,
         otherwise the result of ``self._list_morph_files_by``.
     :raises ValueError: if more than one filter is given.
     """
     # Count the filters actually supplied.  The old check only fired when
     # all three were given, so two filters were silently accepted and one
     # of them ignored.
     filters_given = sum(
         arg is not None for arg in (channels, domains, categories))
     if filters_given > 1:
         raise ValueError('You can specify only one of channels, domains '
                          'and categories parameter at once')
     if filters_given == 0:
         return CorpusReader.fileids(self)
     # Allow a bare string as shorthand for a one-element list.
     if isinstance(channels, basestring):
         channels = [channels]
     if isinstance(domains, basestring):
         domains = [domains]
     if isinstance(categories, basestring):
         categories = [categories]
     if channels:
         return self._list_morph_files_by('channel', channels)
     elif domains:
         return self._list_morph_files_by('domain', domains)
     else:
         return self._list_morph_files_by('keyTerm', categories,
                 map=self._map_category)
Пример #18
0
 def __init__(self, root, fileids):
     """
     Initialize the reader by delegating to ``CorpusReader.__init__``.

     :param root: forwarded unchanged.
     :param fileids: forwarded unchanged.
     """
     # The two trailing ``None`` values fill the third and fourth positional
     # parameters of CorpusReader.__init__ -- presumably encoding and
     # tagset; verify against the base class.
     CorpusReader.__init__(self, root, fileids, None, None)
Пример #19
0
 def __init__(self, root, fileids, wrap_etree=False):
     """
     Initialize the reader.

     :param root: forwarded to ``CorpusReader.__init__``.
     :param fileids: forwarded to ``CorpusReader.__init__``.
     :param wrap_etree: stored as ``self._wrap_etree`` before the base
         class is initialized.
     """
     self._wrap_etree = wrap_etree
     CorpusReader.__init__(self, root, fileids)
Пример #20
0
 def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
     """
     Initialize the corpus reader.

     :param root: forwarded to ``CorpusReader.__init__``.
     :param fileids: forwarded to ``CorpusReader.__init__``; defaults to
         ``PKL_PATTERN``.
     :param kwargs: forwarded unchanged as keyword arguments to
         ``CorpusReader.__init__``.
     """
     CorpusReader.__init__(self, root, fileids, **kwargs)
Пример #21
0
 def __init__(self, root, fileids):
     """
     Initialize the reader by delegating to ``CorpusReader.__init__``.

     :param root: forwarded unchanged.
     :param fileids: forwarded unchanged.
     """
     # The two trailing ``None`` values fill the third and fourth positional
     # parameters of CorpusReader.__init__ -- presumably encoding and
     # tagset; verify against the base class.
     CorpusReader.__init__(self, root, fileids, None, None)
Пример #22
0
 def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
     """
     Initialize the corpus reader.

     :param root: forwarded to ``CorpusReader.__init__``.
     :param fileids: forwarded to ``CorpusReader.__init__``; defaults to
         ``PKL_PATTERN``.
     :param kwargs: forwarded unchanged as keyword arguments to
         ``CorpusReader.__init__``.
     """
     CorpusReader.__init__(self, root, fileids, **kwargs)
Пример #23
0
 def __init__(self, root, fileids, wrap_etree=False):
     """
     Initialize the reader.

     :param root: forwarded to ``CorpusReader.__init__``.
     :param fileids: forwarded to ``CorpusReader.__init__``.
     :param wrap_etree: stored as ``self._wrap_etree`` before the base
         class is initialized.
     """
     self._wrap_etree = wrap_etree
     CorpusReader.__init__(self, root, fileids)
Пример #24
0
 def __init__(self, fileids=r'.*\.gz', data_folder=''):
     """
     Initialize the reader on a folder below the Sussex root.

     :param fileids: forwarded to ``CorpusReader.__init__``.
     :param data_folder: joined onto ``susx._sussex_root`` to form the
         corpus root.
     """
     corpus_root = os.path.join(susx._sussex_root, data_folder)
     CorpusReader.__init__(self, corpus_root, fileids)

     # Both counters start out unset.
     self._n = self._n_sents = None