def __init__(
    self,
    root: Path = (Path(__file__).parent.resolve() / Path("../data/corpus/")),
    target: Path = (Path(__file__).parent.resolve() / Path("../data/corpus_processed/")),
    fileids: str = r".+\.html",
    encoding: str = "utf8",
) -> None:
    """Set up the HTML corpus reader and its text-conversion helpers.

    Keyword Arguments:
        root {Path} -- Path of corpus root.
        target {Path} -- Path of transformed corpus root.
        fileids {str} -- Regex pattern for documents.
        encoding {str} -- String encoding of corpus.
    """
    CorpusReader.__init__(self, str(root), fileids, encoding)
    self.target = target

    # Configure the HTML-to-text converter so it emits plain text only
    # (no links, images, tables or emphasis markers).
    converter = HTML2Text()
    for flag in (
        "ignore_links",
        "ignore_images",
        "ignore_tables",
        "ignore_emphasis",
        "unicode_snob",
    ):
        setattr(converter, flag, True)
    self.html2text = converter

    # Quiet the readability library's logger down to warnings.
    self.log = logging.getLogger("readability.readability")
    self.log.setLevel("WARNING")
def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None,
             target_language=None, paragraph_separator='\n\n', **kwargs):
    """
    :param root: The file root of the corpus directory
    :param fileids: the list of file ids to consider, or wildcard expression
    :param skip_keywords: a list of words which indicate whole paragraphs that should
    be skipped by the paras and words methods()
    :param target_language: which files to select; sometimes a corpus contains
    English translations, we expect these files to be named ...english.json --
    if not, pass in fileids
    :param paragraph_separator: character sequence demarcating paragraph separation
    :param encoding: utf8
    :param kwargs: Any values to be passed to NLTK super classes, such as
    sent_tokenizer, word_tokenizer.
    """
    # Default to matching every JSON file for the requested language.
    target_language = target_language or ''
    fileids = fileids or r'.*{}\.json'.format(target_language)

    # Initialize the NLTK corpus reader objects.
    CorpusReader.__init__(self, root, fileids, encoding)

    # Honour tokenizer overrides handed through for the NLTK layer.
    for key in ('sent_tokenizer', 'word_tokenizer'):
        if key in kwargs:
            setattr(self, '_' + key, kwargs[key])

    self.skip_keywords = skip_keywords
    self.paragraph_separator = paragraph_separator
def __init__(self, root, fileids=DOC_PATTERN, tags=None,
             word_tokenizer=WordPunctTokenizer(),
             sent_tokenizer=nltk.data.LazyLoader(
                 'tokenizers/punkt/english.pickle'),
             encoding='utf8', **kwargs):
    """
    Initialize the corpus reader.  Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to the
    ``CategorizedCorpusReader`` constructor; everything else goes to the
    ``CorpusReader`` constructor.
    """
    # Fall back to the default category pattern unless the caller gave
    # any cat_* argument of their own.
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    # Tokenizers and the POS tags this reader considers meaningful.
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._good_tags = tags or self.TAGS
def __init__(
    self,
    root,
    fileids,
    sep="/",
    word_tokenizer=WhitespaceTokenizer(),
    sent_tokenizer=RegexpTokenizer("\n", gaps=True),
    alignedsent_block_reader=read_alignedsent_block,
    encoding="latin1",
):
    """
    Build an Aligned Corpus reader over the set of documents found
    under the given root directory.

    Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    # Strategy objects for splitting and reading aligned sentences.
    self._alignedsent_block_reader = alignedsent_block_reader
    self._sent_tokenizer = sent_tokenizer
    self._word_tokenizer = word_tokenizer
    self._sep = sep
def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None, **kwargs):
    """
    :param root: The file root of the corpus directory
    :param fileids: the list of file ids to consider, or wildcard expression
    :param skip_keywords: a list of words which indicate whole paragraphs that should
    be skipped by the paras and words methods()
    :param encoding: utf8
    :param kwargs: Any values to be passed to NLTK super classes, such as
    sent_tokenizer, word_tokenizer.
    """
    if not fileids:
        fileids = r'.*\.txt'

    # Initialize the NLTK corpus reader objects.
    # BUG FIX: PlaintextCorpusReader's third positional parameter is
    # ``word_tokenizer``, not ``encoding`` — passing ``encoding``
    # positionally clobbered the tokenizer with the string 'utf8'.
    # Passing it by keyword also makes the explicit follow-up call to
    # CorpusReader.__init__ redundant (PlaintextCorpusReader chains to
    # it), so that call is removed.
    PlaintextCorpusReader.__init__(self, root, fileids, encoding=encoding)

    # Honour tokenizer overrides passed through to the NLTK layer.
    if 'sent_tokenizer' in kwargs:
        self._sent_tokenizer = kwargs['sent_tokenizer']
    if 'word_tokenizer' in kwargs:
        self._word_tokenizer = kwargs['word_tokenizer']

    self.skip_keywords = skip_keywords
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """Initialize the pickled-corpus reader."""
    # Supply the default category pattern when the caller passed no
    # cat_* keyword of their own.
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)
def __init__(
    self,
    root,
    fileids=DOC_PATTERN,
    encoding='utf8',
    **kwargs
):
    """
    Initialize the corpus reader.  Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to the
    ``CategorizedCorpusReader`` constructor; the remaining arguments go
    to the ``CorpusReader`` constructor.
    """
    # When no explicit cat_* argument was given, prefer a category map
    # built from standard-style file names; if that fails for any
    # reason, fall back to directory names as categories.
    if not any(key.startswith('cat_') for key in kwargs):
        try:
            kwargs['cat_map'] = make_cat_map(root, 'txt')
        except Exception as e:
            print(type(e), e, "\nUnable to build category map from file names.\nFalling back to categories by directory name.")
            kwargs['cat_pattern'] = CAT_PATTERN

    # Initialize the NLTK corpus reader objects.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)
def __init__(self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8"):
    """
    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    :param word_tokenizer: Tokenizer for breaking the text of Tweets into
    smaller units, including but not limited to words.
    """
    CorpusReader.__init__(self, root, fileids, encoding)

    # Check that all user-created corpus files are non-empty.
    # BUG FIX: this sentence previously appeared as a stray string
    # expression *after* the loop — a no-op statement that read like
    # documentation; it is now a real comment in the right place.
    for path in self.abspaths(self._fileids):
        if isinstance(path, ZipFilePathPointer):
            pass
        elif os.path.getsize(path) == 0:
            raise ValueError("File {} is empty".format(path))

    self._word_tokenizer = word_tokenizer
def __init__(
    self,
    root,
    fileids,
    sep='/',
    word_tokenizer=WhitespaceTokenizer(),
    sent_tokenizer=RegexpTokenizer('\n', gaps=True),
    alignedsent_block_reader=read_alignedsent_block,
    encoding='latin1',
):
    """
    Construct an Aligned Corpus reader for the documents located at the
    given root directory.

    Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    # Field separator plus the tokenization/block-reading strategies.
    self._sep = sep
    self._alignedsent_block_reader = alignedsent_block_reader
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
def __init__(self, root, fileids=DOC_PATTERN, **kwargs):
    """
    Initialize the corpus reader.

    NOTE(review): despite what sibling readers do, this constructor
    never calls ``CategorizedCorpusReader`` — any ``cat_*`` keyword
    arguments left in ``kwargs`` are silently discarded.  Only ``root``
    and ``fileids`` are forwarded to the ``CorpusReader`` constructor
    (with its default encoding).
    """
    CorpusReader.__init__(self, root, fileids)
def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
    """
    Set up the KNBCorpusReader.

    ``morphs2str`` converts a morpheme list into the string form used
    for the tree representation produced by ``_parse()``.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self.morphs2str = morphs2str
def __init__(self, root, fileids, tone, tag, wrap_etree=False):
    # NOTE(review): assigning ``self.fileids`` shadows the inherited
    # ``CorpusReader.fileids()`` method on this instance — confirm
    # callers read the attribute rather than calling the method.
    self.fileids = fileids
    self._wrap_etree = wrap_etree
    CorpusReader.__init__(self, root, fileids)
    # Containers for sentence/word data — presumably filled by later
    # parsing methods of this class (not visible here); verify.
    self.tagged_sents = []
    self.sents = []
    self.words = []
    self.tagged_words = []
    # Reader options controlling tone and tag handling.
    self.option_tone = tone
    self.option_tag = tag
def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
    """
    Construct a KNBCorpusReader.

    ``morphs2str`` maps a list of morphemes to the string used by
    ``_parse()`` when building the tree representation.
    """
    # FIXME: the class inherits from SyntaxCorpusReader yet initializes
    # via CorpusReader — the reason is unclear.
    CorpusReader.__init__(self, root, fileids, encoding)
    self.morphs2str = morphs2str
def __init__(self, root, fileids, syntax_parser=CaboChaParser(),
             word_tokenizer=MeCabTokenizer(), sent_tokenizer=jp_sent_tokenizer,
             case_parser=KNPParser(), encoding='utf-8'):
    """Wire the reader up with its parsing and tokenization components."""
    CorpusReader.__init__(self, root, fileids, encoding)
    # Keep the processing components for use by the reader's accessors.
    self._case_parser = case_parser
    self._sent_tokenizer = sent_tokenizer
    self._word_tokenizer = word_tokenizer
    self._syntax_parser = syntax_parser
def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8', **kwargs):
    """
    Initialize the corpus reader.  Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to the
    ``CategorizedCorpusReader`` constructor.  The remaining arguments
    are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    # Initialize the NLTK corpus reader objects.
    # BUG FIX: the original called ``TwitterCorpusReader.__init__(self,
    # kwargs)``, passing the kwargs dict as the ``root`` argument.  The
    # cat_pattern setup above follows the CategorizedCorpusReader
    # calling convention (a single kwargs dict), so that is the
    # constructor intended here — TODO confirm the class's bases.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)
def __init__(self, root, zipfile, fileids):
    # Normalize ``root`` to a PathPointer.
    # NOTE(review): ``basestring`` exists only in Python 2 — this block
    # raises NameError on Python 3 unless the module defines a shim;
    # confirm the target interpreter.
    if isinstance(root, basestring):
        root = FileSystemPathPointer(root)
    elif not isinstance(root, PathPointer):
        raise TypeError('CorpusReader: expected a string or a PathPointer')

    # Re-root the reader inside the named zip archive.
    root = ZipFilePathPointer(root.join(zipfile))

    CorpusReader.__init__(self, root, fileids)
    self._parse_char_replacements()
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Initialize the corpus reader: ``cat_*`` keyword arguments
    (``cat_pattern``, ``cat_map``, ``cat_file``) are routed to the
    ``CategorizedCorpusReader`` constructor and the rest to the
    ``CorpusReader`` constructor.
    """
    # Use the default category pattern when none was supplied.
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)
def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8', **kwargs):
    """Initialize the categorized corpus reader."""
    # Fall back to the default category pattern unless the caller gave
    # a cat_* argument of their own.
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN
    # Initialize the NLTK corpus reader objects.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)
def __init__(
    self,
    root: Path = (Path(__file__).parent.resolve() / Path("../data/corpus_processed/")),
    fileids=r".+\.pickle",
):
    """Open the processed (pickled) corpus rooted at ``root``.

    Keyword Arguments:
        root -- Path of corpus root.
        fileids -- Regex pattern for documents.
    """
    CorpusReader.__init__(self, str(root), fileids)
def __init__(self, root, fileids=DOC_PATTERN, encoding="utf8", **kwargs):
    """
    Initialize the corpus reader.  Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to the
    ``CategorizedCorpusReader`` constructor; the rest go to the
    ``CorpusReader`` constructor.
    """
    # Default category pattern when the caller provided none.
    if not any(key.startswith("cat_") for key in kwargs):
        kwargs["cat_pattern"] = self.CAT_PATTERN
    # Initialize the NLTK corpus reader objects (original call order
    # preserved: CorpusReader first, then CategorizedCorpusReader).
    CorpusReader.__init__(self, root, fileids, encoding)
    CategorizedCorpusReader.__init__(self, kwargs)
def __init__(self, root, **kwargs):
    """
    Initialize the corpus reader.  Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to the
    ``CategorizedCorpusReader`` constructor.  The remaining arguments
    are passed to the ``CorpusReader`` constructor.

    Recognized in ``kwargs``:
        fileids  -- list/regexp of fileids (default None = all files).
        encoding -- file encoding (default 'utf8').
    """
    # Get the CorpusReader specific arguments.
    # BUG FIX: the original used ``kwargs.pop('fileids')`` /
    # ``kwargs.pop('encoding')`` with no default, raising KeyError when
    # either argument was omitted; safe defaults are supplied now.
    fileids = kwargs.pop('fileids', None)
    encoding = kwargs.pop('encoding', 'utf8')

    # Initialize the NLTK corpus reader objects.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)
def __init__(self, root, fileids, sep='/', word_tokenizer=WhitespaceTokenizer(),
             sent_tokenizer=RegexpTokenizer('\n', gaps=True),
             encoding=None):
    """
    @param root: The root directory for this corpus.
    @param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    # BUG FIX: the original line ended with a trailing comma
    # (``self._alignedsent_block_reader=None,``), which made this
    # attribute the one-element tuple ``(None,)`` instead of None; the
    # self-assignment that followed was a no-op and is removed.
    self._alignedsent_block_reader = None
    self._alignedsent_corpus_view = None
def __init__(self, root, fileids=None, word_tokenizer=TweetTokenizer(),
             encoding='utf-8-sig'):
    """
    Initialize the Tweet corpus reader.

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    :param word_tokenizer: Tokenizer used to split Tweet text.
    """
    CorpusReader.__init__(self, root, fileids, encoding)

    # Check that all user-created corpus files are non-empty.
    # BUG FIX: this sentence previously sat after the loop as a stray
    # string expression (a no-op); it is now a proper comment.
    for path in self.abspaths(self._fileids):
        if isinstance(path, ZipFilePathPointer):
            pass
        elif os.path.getsize(path) == 0:
            raise ValueError("File {} is empty".format(path))

    self._word_tokenizer = word_tokenizer
def __init__(self, root, fileids=PathPattern.doc_pattern.value, encoding='utf8', **kwargs):
    """
    Initialize the reader for intermediate processed corpus files.
    """
    # Add the default category pattern unless one was passed explicitly.
    # BUG FIX: the original called ``key.startswitch('cat_')`` —
    # ``startswitch`` is not a str method and raised AttributeError;
    # the intended call is ``startswith``.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = PathPattern.cat_pattern.value
    # Initialize the NLTK corpus reader objects.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)
    # Counter used to accumulate tokens.
    self.__tokens = Counter()
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Initialize the corpus reader.  ``cat_*`` keyword arguments
    (``cat_pattern``, ``cat_map``, ``cat_file``) go to the
    ``CategorizedCorpusReader`` constructor; the rest go to the
    ``CorpusReader`` constructor.
    """
    # Default category pattern when none was supplied.
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)
    # Default word and sentence tokenizers.
    self._word_tokenizer = WordPunctTokenizer()
    self._sent_tokenizer = nltk.data.LazyLoader(
        'tokenizers/punkt/english.pickle')
def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8', tags=TAGS, **kwargs):
    """
    Initialize the corpus reader.  Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are routed to the
    ``CategorizedCorpusReader`` constructor; the remaining arguments go
    to the ``CorpusReader`` constructor.
    """
    # Default category pattern unless the caller gave a cat_* argument.
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN

    # Initialize the NLTK corpus reader objects.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    # Remember which tags should be extracted.
    self.tags = tags
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Initialize the pickled corpus reader on top of two NLTK corpus
    readers.

    Parameters
    ----------
    root : str like
        The root directory for the corpus.
    fileids : str like
        A regex pattern matching the corpus document files.
    kwargs :
        Additional arguments passed to the NLTK corpus readers.
    """
    # Default category pattern when the caller supplied no cat_* key.
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)
def __init__(self, rootpath, fileids, encoding="utf8", default_text_selector=lambda row: None, **kwargs): """ Initialize CSV corpus reader Arguments: rootpath (str) - path to folder with corpus files (see NLTK CorpusReader for more info) fileids (list str) - names of files in root (see NLTK CorpusReader for more info) default_test_selector (lambda) - default selector that will be used to extract text from corpus **kwargs (named arguments) - arguemnts passed to csv.DictReader (see csv.DictReader for more info) """ # Initialize base NLTK corpus reader object CorpusReader.__init__(self, rootpath, fileids, encoding=encoding) # Initialize default selector self.__default_text_selectors = default_text_selector # Save csv parser params self.csv_kwargs = kwargs
def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8', tags=TAGS, **kwargs):
    """
    Initialize the corpus reader.

    Categorization arguments (``cat_pattern``, ``cat_map`` and
    ``cat_file``) are passed to the ``CategorizedCorpusReader``
    constructor; the remaining arguments are passed to the
    ``CorpusReader`` constructor.
    """
    # Add the default category pattern unless one was passed explicitly.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    # Initialize the NLTK corpus reader objects.
    CategorizedCorpusReader.__init__(
        self, kwargs)  # keyword arguments are passed through
    # BUG FIX: the ``encoding`` parameter was accepted but never
    # forwarded, so a non-default encoding was silently ignored.
    CorpusReader.__init__(self, root, fileids, encoding)

    # Remember the tags to be extracted.
    self.tags = tags
def __init__(self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding='utf8'):
    """
    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    :param word_tokenizer: Tokenizer for breaking the text of Tweets into
    smaller units, including but not limited to words.
    """
    CorpusReader.__init__(self, root, fileids, encoding)

    # Check that all user-created corpus files are non-empty.
    # BUG FIX: this sentence was originally a stray string expression
    # placed after the loop — a no-op that looked like documentation;
    # it is now a proper comment in the right place.
    for path in self.abspaths(self._fileids):
        if isinstance(path, ZipFilePathPointer):
            pass
        elif os.path.getsize(path) == 0:
            raise ValueError("File {} is empty".format(path))

    self._word_tokenizer = word_tokenizer
def __init__(self, root, fileids, column_types=None, top_node='S',
             beginning_of_sentence=r'#BOS.+$', end_of_sentence=r'#EOS.+$',
             encoding=None):
    """
    Construct a new corpus reader for reading NEGRA corpus files.

    @param root: The root directory of the corpus files.
    @param fileids: A list of or regex specifying the files to read from.
    @param column_types: An optional C{list} of columns in the corpus.
    @param top_node: The top node of chunked sentence trees.
    @param beginning_of_sentence: A regex specifying the start of a sentence
    @param end_of_sentence: A regex specifying the end of a sentence
    @param encoding: The default corpus file encoding.
    """
    # Make sure there are no invalid column types.
    if isinstance(column_types, list):
        for column_type in column_types:
            if column_type not in self.COLUMN_TYPES:
                # BUG FIX: the original interpolated the undefined name
                # ``columntype``, which raised NameError instead of the
                # intended ValueError.
                raise ValueError("Column %r is not supported." % column_type)
    else:
        column_types = self.COLUMN_TYPES

    # Store reader configuration.
    self._top_node = top_node
    self._column_types = column_types
    self._fileids = fileids
    self._bos = beginning_of_sentence
    self._eos = end_of_sentence
    # Map column name -> column index.
    self._colmap = dict((c, i) for (i, c) in enumerate(column_types))

    # Finish constructing by calling the extended class' constructor.
    CorpusReader.__init__(self, root, fileids, encoding)
def __init__(self, events, fileids=None, encoding='utf8', tags=TAGS, **kwargs):
    """
    Initialize the corpus reader over an events list.  Categorization
    arguments (``cat_pattern``, ``cat_map``, and ``cat_file``) are
    passed to the ``CategorizedCorpusReader`` constructor; the
    remaining arguments go to the ``CorpusReader`` constructor.
    """
    # When the caller gave no cat_* argument, explicitly record an
    # empty category pattern (behaviour preserved from the original).
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = None

    # Initialize the NLTK corpus reader objects; the root is fixed to
    # the current directory.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, '.', fileids, encoding)

    # Keep the events list, the POS tagger, and the HTML tags of
    # interest.
    self.events = events
    self.tagger = pos_tagger('spacy')
    self.htmltags = tags
def __init__(self, root, fileids, column_types=None, top_node='S',
             beginning_of_sentence=r'#BOS.+$', end_of_sentence=r'#EOS.+$',
             encoding=None):
    """
    Construct a new corpus reader for reading NEGRA corpus files.

    @param root: The root directory of the corpus files.
    @param fileids: A list of or regex specifying the files to read from.
    @param column_types: An optional C{list} of columns in the corpus.
    @param top_node: The top node of parsed sentence trees.
    @param beginning_of_sentence: A regex specifying the start of a sentence
    @param end_of_sentence: A regex specifying the end of a sentence
    @param encoding: The default corpus file encoding.
    """
    # Make sure there are no invalid column types.
    if isinstance(column_types, list):
        for column_type in column_types:
            if column_type not in self.COLUMN_TYPES:
                # BUG FIX: the original referenced the undefined name
                # ``columntype`` here, raising NameError instead of the
                # intended ValueError.
                raise ValueError("Column %r is not supported." % column_type)
    else:
        column_types = self.COLUMN_TYPES

    # Store reader configuration.
    self._top_node = top_node
    self._column_types = column_types
    self._fileids = fileids
    self._bos = beginning_of_sentence
    self._eos = end_of_sentence
    # Map column name -> column index.
    self._colmap = dict((c, i) for (i, c) in enumerate(column_types))

    # Finish constructing by calling the extended class' constructor.
    CorpusReader.__init__(self, root, fileids, encoding)
def __init__(self, root, fileids):
    # Delegate straight to CorpusReader, explicitly passing None for
    # the two optional positional arguments.
    CorpusReader.__init__(self, root, fileids, None, None)
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """Construct the reader, forwarding extra kwargs to CorpusReader."""
    CorpusReader.__init__(self, root, fileids, **kwargs)
def __init__(self, fileids=r'.*\.review'):
    """Open the Amazon customer-reviews corpus under the Sussex data root."""
    _root = os.path.join(susx._sussex_root, 'data/amazon_customer_reviews')
    # Fixed, precomputed sentence count for this corpus.
    self._n_sents = 140443
    CorpusReader.__init__(self, _root, fileids)
    # Cached count, left unset until computed elsewhere.
    self._n = None
def __init__(self, root, fileids=DOC_PATTERN):
    """
    Initialize the reader class.
    """
    CorpusReader.__init__(self, root, fileids)
def __init__(self, fileids=r'.*\.mrg'):
    """Open the Penn Treebank (Stanford deps) corpus under the Sussex data root."""
    _root = os.path.join(susx._sussex_root,
                         'data/penn_treebank_npbrac_stanforddeps')
    CorpusReader.__init__(self, _root, fileids)
    # Cached count, left unset until computed elsewhere.
    self._n = None
    # Fixed, precomputed sentence count for this corpus.
    self._n_sents = 51520
def __init__(self, root, fileids, wrap_etree=False):
    # Remember the etree-wrapping flag before delegating to the base
    # reader.
    self._wrap_etree = wrap_etree
    CorpusReader.__init__(self, root, fileids)
def __init__(self, fileids=r'.*\.gz', data_folder=''):
    """Open a gzip-file corpus located in ``data_folder`` under the Sussex data root."""
    _root = os.path.join(susx._sussex_root, data_folder)
    CorpusReader.__init__(self, _root, fileids)
    # Cached counts, left unset until computed elsewhere.
    self._n = None
    self._n_sents = None