def __init__(self, root, fileids=DOC_PATTERN, tags=None,
             word_tokenizer=WordPunctTokenizer(),
             sent_tokenizer=nltk.data.LazyLoader(
                 'tokenizers/punkt/english.pickle'),
             encoding='utf8', **kwargs):
    """
    Set up the reader over the corpus rooted at ``root``.

    Keyword arguments whose names start with ``cat_`` (``cat_pattern``,
    ``cat_map``, ``cat_file``) are categorization arguments and are handed
    to the ``CategorizedCorpusReader`` constructor. When the caller
    supplies none of them, ``cat_pattern`` is set to ``CAT_PATTERN``.
    ``root``, ``fileids`` and ``encoding`` are forwarded to the
    ``CorpusReader`` constructor.

    The word and sentence tokenizers are stored on the instance, and
    ``tags`` falls back to ``self.TAGS`` when it is falsy.
    """
    # Only inject the default pattern when the caller expressed no
    # categorization preference at all.
    has_category_arg = any(name.startswith('cat_') for name in kwargs)
    if not has_category_arg:
        kwargs['cat_pattern'] = CAT_PATTERN

    # Both NLTK base classes are initialized explicitly; the categorized
    # reader receives the keyword dict itself, not unpacked keywords.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._good_tags = tags if tags else self.TAGS
def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8', **kwargs):
    """
    Set up the reader over the corpus rooted at ``root``.

    Categorization arguments (``cat_pattern``, ``cat_map``, ``cat_file``)
    are handed to the ``CategorizedCorpusReader`` constructor; ``root``,
    ``fileids`` and ``encoding`` go to the ``CorpusReader`` constructor.

    When the caller supplies no ``cat_*`` keyword, a category map is
    built with ``make_cat_map(root, 'txt')``. If that call raises, the
    error is printed and ``cat_pattern`` is set to ``CAT_PATTERN`` instead.
    """
    caller_chose_categories = any(
        name.startswith('cat_') for name in kwargs
    )
    if not caller_chose_categories:
        try:
            # Preferred: derive the category map from the file names.
            kwargs['cat_map'] = make_cat_map(root, 'txt')
        except Exception as err:
            # Best-effort fallback: report the problem and categorize
            # with the default pattern instead.
            print(type(err), err, "\nUnable to build category map from file names.\nFalling back to categories by directory name.")
            kwargs['cat_pattern'] = CAT_PATTERN

    # The categorized reader receives the keyword dict itself.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Set up the reader over the corpus rooted at ``root``.

    Keyword arguments are handed, as a dict, to the
    ``CategorizedCorpusReader`` constructor. If none of them has a name
    starting with ``cat_``, ``cat_pattern`` is set to ``CAT_PATTERN``
    first. ``root`` and ``fileids`` go to the ``CorpusReader``
    constructor.
    """
    # No categorization keyword given: use the default pattern.
    if all(not name.startswith('cat_') for name in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """Set up the reader over the corpus rooted at ``root``.

    Categorization arguments (``cat_pattern``, ``cat_map``, ``cat_file``)
    are handed to the ``CategorizedCorpusReader`` constructor, with
    ``cat_pattern`` set to ``CAT_PATTERN`` when the caller gives no
    ``cat_*`` keyword. ``root`` and ``fileids`` go to the
    ``CorpusReader`` constructor.
    """
    category_keys = [name for name in kwargs if name.startswith('cat_')]
    if not category_keys:
        # Nothing categorization-related was passed in; use the default.
        kwargs['cat_pattern'] = CAT_PATTERN

    # The categorized reader takes the keyword dict as one argument.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Set up the reader over the corpus rooted at ``root``.

    Categorization arguments (``cat_pattern``, ``cat_map``, ``cat_file``)
    are handed to the ``CategorizedCorpusReader`` constructor; ``root``
    and ``fileids`` go to the ``CorpusReader`` constructor. Without any
    ``cat_*`` keyword from the caller, ``cat_pattern`` is set to
    ``CAT_PATTERN``.
    """
    wants_default_pattern = True
    for name in kwargs:
        if name.startswith('cat_'):
            # The caller picked a categorization scheme; leave it alone.
            wants_default_pattern = False
            break

    if wants_default_pattern:
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)
def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8', **kwargs):
    """
    Set up the reader over the corpus rooted at ``root``.

    The keyword dict is handed to the ``CategorizedCorpusReader``
    constructor, with ``cat_pattern`` set to ``CAT_PATTERN`` when no
    keyword name starts with ``cat_``. ``root``, ``fileids`` and
    ``encoding`` go to the ``CorpusReader`` constructor.
    """
    explicit_category = any(name.startswith('cat_') for name in kwargs)
    if not explicit_category:
        # Default categorization when the caller did not choose one.
        kwargs['cat_pattern'] = CAT_PATTERN

    # Initialize both NLTK base classes explicitly.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)
def __init__(self, root, fileids=DOC_PATTERN, encoding="utf8", **kwargs):
    """
    Set up the reader over the corpus rooted at ``root``.

    Categorization arguments (``cat_pattern``, ``cat_map``, ``cat_file``)
    are handed to the ``CategorizedCorpusReader`` constructor; ``root``,
    ``fileids`` and ``encoding`` go to the ``CorpusReader`` constructor.
    Without any ``cat_*`` keyword from the caller, ``cat_pattern`` is set
    to the class attribute ``self.CAT_PATTERN``.
    """
    if all(not name.startswith("cat_") for name in kwargs):
        # No categorization keyword given: use the class-level default.
        kwargs["cat_pattern"] = self.CAT_PATTERN

    # The plain corpus reader is initialized first, then the categorized
    # reader, which receives the keyword dict itself.
    CorpusReader.__init__(self, root, fileids, encoding)
    CategorizedCorpusReader.__init__(self, kwargs)
def __init__(self, root, **kwargs):
    """
    Initialize the corpus reader.

    ``fileids`` must be supplied as a keyword argument. ``encoding`` is
    optional and defaults to ``'utf8'``. Both are removed from the
    keyword dict and forwarded, together with ``root``, to the
    ``CorpusReader`` constructor. The remaining keyword arguments
    (``cat_pattern``, ``cat_map``, ``cat_file``) are handed to the
    ``CategorizedCorpusReader`` constructor.

    Raises:
        KeyError: if ``fileids`` is not given.
    """
    # Pull out the CorpusReader-specific arguments so that only the
    # categorization arguments remain in ``kwargs``.
    fileids = kwargs.pop('fileids')
    # Previously a missing ``encoding`` raised a bare KeyError; default it
    # instead so the argument is genuinely optional.
    encoding = kwargs.pop('encoding', 'utf8')

    # Initialize the NLTK corpus reader objects.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)
def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8',
             tags=TAGS, **kwargs):
    """Set up the reader over the corpus rooted at ``root``.

    Categorization arguments (``cat_pattern``, ``cat_map``, ``cat_file``)
    are handed to the ``CategorizedCorpusReader`` constructor, with
    ``cat_pattern`` set to ``CAT_PATTERN`` when the caller gives no
    ``cat_*`` keyword. ``root``, ``fileids`` and ``encoding`` go to the
    ``CorpusReader`` constructor. ``tags`` is stored as ``self.tags``.
    """
    category_given = any(name.startswith('cat_') for name in kwargs)
    if not category_given:
        kwargs['cat_pattern'] = CAT_PATTERN

    # Initialize both NLTK base classes; the categorized reader takes the
    # keyword dict as a single argument.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    # Tags to extract, kept on the instance for later use.
    self.tags = tags
def __init__(self, root, fileids=PathPattern.doc_pattern.value,
             encoding='utf8', **kwargs):
    """
    Initialize the reader for the intermediate preprocessed corpus files.

    Keyword arguments are handed, as a dict, to the
    ``CategorizedCorpusReader`` constructor. If none of them has a name
    starting with ``cat_``, ``cat_pattern`` is set to
    ``PathPattern.cat_pattern.value`` first. ``root``, ``fileids`` and
    ``encoding`` go to the ``CorpusReader`` constructor.
    """
    # Add the category pattern if it was not passed in explicitly.
    # The method is ``str.startswith``; the previous misspelling raised
    # AttributeError as soon as any keyword argument was supplied.
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = PathPattern.cat_pattern.value

    # Initialize the NLTK corpus reader objects.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    # Counter used to collect tokens.
    self.__tokens = Counter()
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Set up the reader over the corpus rooted at ``root``.

    Categorization arguments (``cat_pattern``, ``cat_map``, ``cat_file``)
    are handed to the ``CategorizedCorpusReader`` constructor, with
    ``cat_pattern`` set to ``CAT_PATTERN`` when the caller gives no
    ``cat_*`` keyword. ``root`` and ``fileids`` go to the
    ``CorpusReader`` constructor.

    A ``WordPunctTokenizer`` and a lazily loaded
    ``tokenizers/punkt/english.pickle`` resource are stored on the
    instance as the word and sentence tokenizers.
    """
    category_given = any(name.startswith('cat_') for name in kwargs)
    if not category_given:
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)

    # Tokenizers are created per instance.
    self._word_tokenizer = WordPunctTokenizer()
    punkt_resource = 'tokenizers/punkt/english.pickle'
    self._sent_tokenizer = nltk.data.LazyLoader(punkt_resource)
def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8',
             tags=TAGS, **kwargs):
    """
    Set up the reader over the corpus rooted at ``root``.

    Categorization arguments (``cat_pattern``, ``cat_map``, ``cat_file``)
    are handed to the ``CategorizedCorpusReader`` constructor; ``root``,
    ``fileids`` and ``encoding`` go to the ``CorpusReader`` constructor.
    Without any ``cat_*`` keyword from the caller, ``cat_pattern`` is set
    to ``CAT_PATTERN``. ``tags`` is stored as ``self.tags``.
    """
    if all(not name.startswith('cat_') for name in kwargs):
        # No categorization keyword given: use the default pattern.
        kwargs['cat_pattern'] = CAT_PATTERN

    # Initialize both NLTK base classes explicitly.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    # Remember which tags should be extracted.
    self.tags = tags
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Initialise the pickled corpus reader on top of two NLTK readers.

    Parameters
    ----------
    root : str like
        The root directory for the corpus.
    fileids : str like
        A regex pattern for the corpus document files.
    kwargs :
        Additional arguments handed, as a dict, to the
        ``CategorizedCorpusReader`` constructor. If no key starts with
        ``cat_``, ``cat_pattern`` is set to ``CAT_PATTERN`` first.
    """
    category_keys = [name for name in kwargs if name.startswith('cat_')]
    if not category_keys:
        # Default categorization when the caller did not choose one.
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)
def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8',
             tags=TAGS, **kwargs):
    """
    Initialize the corpus reader.

    Categorization arguments (``cat_pattern``, ``cat_map`` and
    ``cat_file``) are passed to the ``CategorizedCorpusReader``
    constructor. ``root``, ``fileids`` and ``encoding`` are passed to the
    ``CorpusReader`` constructor. ``tags`` is stored as ``self.tags``.
    """
    # Add the category pattern if it was not passed in explicitly.
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN

    # Initialize the NLTK corpus reader objects. The categorized reader
    # receives the keyword-argument dict as a single positional argument.
    CategorizedCorpusReader.__init__(self, kwargs)
    # Forward ``encoding`` as well; it was previously accepted by this
    # constructor but dropped, so a caller-supplied encoding was ignored.
    CorpusReader.__init__(self, root, fileids, encoding)

    # Save the tags that should be extracted.
    self.tags = tags
def __init__(self, events, fileids=None, encoding='utf8', tags=TAGS,
             **kwargs):
    """
    Set up the reader around a list of events.

    Categorization arguments (``cat_pattern``, ``cat_map``, ``cat_file``)
    are handed to the ``CategorizedCorpusReader`` constructor; when the
    caller gives no ``cat_*`` keyword, ``cat_pattern`` is set to ``None``.
    The ``CorpusReader`` constructor is called with ``'.'`` as the root,
    plus ``fileids`` and ``encoding``.

    ``events`` is stored as ``self.events``, ``tags`` as
    ``self.htmltags``, and ``self.tagger`` is created with
    ``pos_tagger('spacy')``.
    """
    category_given = any(name.startswith('cat_') for name in kwargs)
    if not category_given:
        # NOTE(review): presumably a placeholder so the categorized reader
        # accepts the call without a real pattern; confirm against NLTK.
        kwargs['cat_pattern'] = None

    # Initialize both NLTK base classes; the root is fixed to the current
    # directory rather than taken from the caller.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, '.', fileids, encoding)

    # Keep the events list, a tagger, and the tags on the instance.
    self.events = events
    self.tagger = pos_tagger('spacy')
    self.htmltags = tags