def __init__(self, root, fileids=DOC_PATTERN, word_tokenizer=TweetTokenizer(),
             encoding='utf8', **kwargs):
    """
    Initialize a categorized Twitter corpus reader.

    :param root: The corpus root, either a ``PathPointer`` or a string.
        A string of the form ``<a>.gz[/]<b>.zip[/]<entry>`` is turned into
        a ``ZipFilePathPointer`` for ``<b>.zip`` and ``<entry>``; any other
        string becomes a ``FileSystemPathPointer``.
    :param fileids: Forwarded to ``TwitterCorpusReader.__init__``
        (defaults to ``DOC_PATTERN``).
    :param word_tokenizer: Tokenizer forwarded to
        ``TwitterCorpusReader.__init__``.
    :param encoding: Encoding forwarded to ``TwitterCorpusReader.__init__``.
    :param kwargs: Passed as a dict to ``CategorizedCorpusReader.__init__``.
        If no key starts with ``cat_``, ``cat_pattern`` is set to
        ``CAT_PATTERN`` first.
    :raises TypeError: If ``root`` is neither a string nor a ``PathPointer``.
    """
    # Fall back to the default category pattern unless the caller supplied
    # a ``cat_*`` argument of their own.
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    # Pass tokenizer and encoding by keyword.  Passing ``encoding`` as the
    # third positional argument put it in the ``word_tokenizer`` slot and
    # dropped the tokenizer the caller asked for.
    TwitterCorpusReader.__init__(
        self, root, fileids, word_tokenizer=word_tokenizer, encoding=encoding
    )

    if isinstance(root, string_types) and not isinstance(root, PathPointer):
        # The trailing "|" adds an empty alternative, so the match always
        # succeeds; every group is None when the first alternative fails.
        # (An earlier variant tried the .zip part before the .gz part:
        # '(.*\.zip)/?(.*\.gz)/?(.*)$|')
        m = re.match(r'(.*\.gz)/?(.*\.zip)/?(.*)$|', root)
        gzipfile, zipfile, zipentry = m.groups()
        if zipfile:
            root = ZipFilePathPointer(zipfile, zipentry)
        elif gzipfile:
            # NOTE(review): unreachable with the current pattern -- whenever
            # the .gz group matches, the .zip group matches too.  Confirm
            # what gzip-only roots are supposed to do.
            root = ZipFilePathPointer(gzipfile, zipentry)
        else:
            root = FileSystemPathPointer(root)
    elif not isinstance(root, PathPointer):
        raise TypeError('CorpusReader: expected a string or a PathPointer')

    self._root = root
    self.current_doc = []
def __load(self):
    """
    Build the real corpus reader and turn this object into it.

    The reader class and its arguments are read from this object's own
    attributes; the corpus root is the ``wordnet/`` entry of the
    ``wordnet.zip`` archive located under ``ROOT_PATH``.
    """
    # Locate the zipped corpus relative to the project root.
    archive_path = os.path.join(
        ROOT_PATH, 'plugins', 'crawl', 'wordnet', 'wordnet.zip'
    )
    corpus_root = ZipFilePathPointer(archive_path, 'wordnet/')

    # Instantiate the reader with the stored positional/keyword arguments.
    reader = self.__reader_cls(corpus_root, *self.__args, **self.__kwargs)

    # Take over the reader's state first, then its type, so that from here
    # on this object behaves as the loaded corpus.
    self.__dict__ = vars(reader)
    self.__class__ = reader.__class__
def create_text_corpus_from_zipfile(
    zf: ZipFile, pattern=r'.*\.txt', ensure_loaded=True
) -> PlaintextCorpusReader:
    """
    Load a text corpus contained in a zipfile.

    :param zf: The zip archive holding the corpus; it is wrapped in a
        ``ZipFilePathPointer`` and used as the corpus root.
    :param pattern: File-id pattern handed to ``PlaintextCorpusReader``
        (default ``.*\\.txt``).
    :param ensure_loaded: When true, ``corpus.ensure_loaded()`` is called
        before the reader is returned.
    :return: The ``PlaintextCorpusReader`` built over the archive.
    """
    # The default pattern is a raw string so that "\." reaches the regex
    # engine as written instead of being an invalid string escape.
    pointer = ZipFilePathPointer(zf)
    corpus = PlaintextCorpusReader(pointer, pattern)
    if ensure_loaded:
        corpus.ensure_loaded()
    return corpus
def __init__(self, root, fileids, encoding='utf8', tagset=None):
    """
    :type root: PathPointer or str
    :param root: A path pointer identifying the root directory for
        this corpus.  If a string is specified, then it will be
        converted to a ``PathPointer`` automatically.
    :param fileids: A list of the files that make up this corpus.
        This list can either be specified explicitly, as a list of
        strings; or implicitly, as a regular expression over file
        paths.  The absolute path for each file will be constructed
        by joining the reader's root to each file name.
    :param encoding: The default unicode encoding for the files
        that make up the corpus.  The value of ``encoding`` can be any
        of the following:

        - A string: ``encoding`` is the encoding name for all files.
        - A dictionary: ``encoding[file_id]`` is the encoding
          name for the file whose identifier is ``file_id``.  If
          ``file_id`` is not in ``encoding``, then the file
          contents will be processed using non-unicode byte strings.
        - A list: ``encoding`` should be a list of ``(regexp, encoding)``
          tuples.  The encoding for a file whose identifier is ``file_id``
          will be the ``encoding`` value for the first tuple whose
          ``regexp`` matches the ``file_id``.  If no tuple's ``regexp``
          matches the ``file_id``, the file contents will be processed
          using non-unicode byte strings.
        - None: the file contents of all files will be
          processed using non-unicode byte strings.
    :param tagset: The name of the tagset used by this corpus, to be used
        for normalizing or converting the POS tags returned by the
        tagged_...() methods.
    :raises TypeError: If ``root`` is neither a string nor a
        ``PathPointer``.
    """
    # Convert the root to a path pointer, if necessary.
    if isinstance(root, compat.string_types) and not isinstance(root, PathPointer):
        # The trailing "|" adds an empty alternative, so the match always
        # succeeds; both groups are None when root does not name a zipfile.
        m = re.match(r'(.*\.zip)/?(.*)$|', root)
        zipfile, zipentry = m.groups()
        if zipfile:
            root = ZipFilePathPointer(zipfile, zipentry)
        else:
            root = FileSystemPathPointer(root)
    elif not isinstance(root, PathPointer):
        raise TypeError('CorpusReader: expected a string or a PathPointer')

    # If `fileids` is a regexp, then expand it.
    if isinstance(fileids, compat.string_types):
        fileids = find_corpus_fileids(root, fileids)

    self._fileids = fileids
    """A list of the relative paths for the fileids that make up
    this corpus."""

    self._root = root
    """The root directory for this corpus."""

    # If encoding was specified as a list of regexps, then convert
    # it to a dictionary.
    if isinstance(encoding, list):
        encoding_dict = {}
        for fileid in self._fileids:
            # The first matching pattern wins; a fileid that matches no
            # pattern gets no entry at all.
            for regexp, enc in encoding:
                if re.match(regexp, fileid):
                    encoding_dict[fileid] = enc
                    break
        encoding = encoding_dict

    self._encoding = encoding
    """The default unicode encoding for the fileids that make up
       this corpus.  If ``encoding`` is None, then the file
       contents are processed using byte strings."""

    self._tagset = tagset
# T(he original version of t)his code was written by Ulrich Germann (11/2010)
######################################################################

import nltk

# Search the course-local NLTK data directory before any other location.
nltk.data.path.insert(0, '/u/csc485h/include/a3/nltk')

# The following code provides access to the tagged NY Times corpus
#   nyt_big   is the full corpus
#   nyt_mini  a small subset for development
from nltk.data import ZipFilePathPointer
from nltk.corpus import TaggedCorpusReader

nyt_zipped = ZipFilePathPointer(
    '/u/csc485h/include/a3/nltk/corpora/nyt.zip', 'nyt/'
)


def _nyt_reader(fileid):
    """Open a single word/tag file from the zipped NYT corpus."""
    return TaggedCorpusReader(nyt_zipped, [fileid], sep='/', encoding='latin2')


nyt_big = _nyt_reader('2004-tagged.txt')
nyt_mini = _nyt_reader('nytimes-mini.txt')

# Finally, let's set up a default pattern for NP chunking
# Setting up the NP chunker itself is left to the main script, to encourage
# trying different variants of the pattern

## Operator     Behavior
## .            Wildcard, matches any character
## ^abc         Matches some pattern abc at the start of a string
## abc$         Matches some pattern abc at the end of a string
## [abc]        Matches one of a set of characters
## [A-Z0-9]     Matches one of a range of characters
## ed|ing|s     Matches one of the specified strings (disjunction)
## *            Zero or more of previous item, e.g. a*, [a-z]* (also known as Kleene Closure)
resource_name = normalize_resource_name(resource_name, True) # Resolve default paths at runtime in-case the user overrides # vikinlp.data.pathevanlp paths is None: paths = path # Check if the resource name includes a zipfile name m = re.match(r'(.*\.zip)/?(.*)$|', resource_name) zipfile, zipentry = m.groups() # Check each item in our path for path_ in paths: # Is the path item a zipfile? if path_ and (os.path.isfile(path_) and path_.endswith('.zip')): try: return ZipFilePathPointer(path_, resource_name) except IOError: # resource not in zipfile continue # Is the path item a directory or is resource_name an absolute path? elif not path_ or os.path.isdir(path_): if zipfile is None: p = os.path.join(path_, url2pathname(resource_name)) if os.path.exists(p): if p.endswith('.gz'): return GzipFileSystemPathPointer(p) else: return FileSystemPathPointer(p) else: p = os.path.join(path_, url2pathname(zipfile))
def __init__(self, root, fileids, encoding=None, tag_mapping_function=None):
    """
    @type root: L{PathPointer} or C{str}
    @param root: A path pointer identifying the root directory for
        this corpus.  If a string is specified, then it will be
        converted to a L{PathPointer} automatically.
    @param fileids: A list of the files that make up this corpus.
        This list can either be specified explicitly, as a list of
        strings; or implicitly, as a regular expression over file
        paths.  The absolute path for each file will be constructed
        by joining the reader's root to each file name.
    @param encoding: The default unicode encoding for the files
        that make up the corpus.  C{encoding}'s value can be any
        of the following:

          - B{A string}: C{encoding} is the encoding name for all
            files.
          - B{A dictionary}: C{encoding[file_id]} is the encoding
            name for the file whose identifier is C{file_id}.  If
            C{file_id} is not in C{encoding}, then the file
            contents will be processed using non-unicode byte
            strings.
          - B{A list}: C{encoding} should be a list of C{(regexp,
            encoding)} tuples.  The encoding for a file whose
            identifier is C{file_id} will be the C{encoding} value
            for the first tuple whose C{regexp} matches the
            C{file_id}.  If no tuple's C{regexp} matches the
            C{file_id}, the file contents will be processed using
            non-unicode byte strings.
          - C{None}: the file contents of all files will be
            processed using non-unicode byte strings.
    @param tag_mapping_function: A function for normalizing or
        simplifying the POS tags returned by the tagged_words()
        or tagged_sents() methods.
    @raise TypeError: If C{root} is neither a string nor a
        L{PathPointer}.
    """
    # Convert the root to a path pointer, if necessary.
    if isinstance(root, basestring):
        # The trailing "|" adds an empty alternative, so the match always
        # succeeds; both groups are None when root does not name a zipfile.
        m = re.match(r'(.*\.zip)/?(.*)$|', root)
        zipfile, zipentry = m.groups()
        if zipfile:
            root = ZipFilePathPointer(zipfile, zipentry)
        else:
            root = FileSystemPathPointer(root)
    elif not isinstance(root, PathPointer):
        raise TypeError('CorpusReader: expected a string or a PathPointer')

    # If `fileids` is a regexp, then expand it.
    if isinstance(fileids, basestring):
        fileids = find_corpus_fileids(root, fileids)

    self._fileids = fileids
    """A list of the relative paths for the fileids that make up
    this corpus."""

    self._root = root
    """The root directory for this corpus."""

    # If encoding was specified as a list of regexps, then convert
    # it to a dictionary.
    if isinstance(encoding, list):
        encoding_dict = {}
        for fileid in self._fileids:
            # The first matching pattern wins; a fileid that matches no
            # pattern gets no entry at all.
            for regexp, enc in encoding:
                if re.match(regexp, fileid):
                    encoding_dict[fileid] = enc
                    break
        encoding = encoding_dict

    self._encoding = encoding
    """The default unicode encoding for the fileids that make up
       this corpus.  If C{encoding} is C{None}, then the file
       contents are processed using byte strings (C{str})."""

    self._tag_mapping_function = tag_mapping_function