def __init__(self, fileobj, min_article_character=200, processes=None,
             lemmatize=utils.has_pattern(), filter_namespaces=('0', ),
             include_interlinks=False):
    """Store the dump handle and the processing options on the instance.

    Parameters
    ----------
    fileobj : file
        File descriptor of the MediaWiki dump.
    min_article_character : int, optional
        Minimal number of characters for an article (titles and leading
        gaps excluded).
    processes : int, optional
        Number of processes. When None, it is replaced by
        ``max(1, multiprocessing.cpu_count() - 1)``.
    lemmatize : bool, optional
        If the `pattern` package is installed, use fancier shallow parsing
        to get token lemmas; otherwise use simple regexp tokenization.
        The default is the result of ``utils.has_pattern()``, evaluated
        once when this function is defined.
    filter_namespaces : tuple of str, optional
        Namespace identifiers used for filtering; the default is ``('0', )``.
        NOTE(review): whether the listed namespaces are kept or dropped is
        decided by the code that reads this attribute -- confirm there.
    include_interlinks : bool, optional
        Whether or not interlinks should be included in the output.

    """
    # Resolve the worker count first: without an explicit value, use every
    # core but one, and never fewer than one process.
    self.processes = (
        max(1, multiprocessing.cpu_count() - 1) if processes is None else processes
    )

    # Input handle.
    self.fileobj = fileobj

    # Article selection settings.
    self.min_article_character = min_article_character
    self.filter_namespaces = filter_namespaces

    # Output settings. `metadata` always starts as False; this constructor
    # takes no argument for it.
    self.lemmatize = lemmatize
    self.include_interlinks = include_interlinks
    self.metadata = False
def __init__(
    self,
    fileobj,
    min_article_character=200,
    processes=None,
    lemmatize=utils.has_pattern(),
    filter_namespaces=('0',),
    include_interlinks=False,
):
    """Record the MediaWiki dump handle and the extraction settings.

    Parameters
    ----------
    fileobj : file
        File descriptor of the MediaWiki dump.
    min_article_character : int, optional
        Minimal number of characters for an article (titles and leading
        gaps excluded).
    processes : int, optional
        Number of processes. If None, ``max(1, multiprocessing.cpu_count() - 1)``
        is used instead.
    lemmatize : bool, optional
        If the `pattern` package is installed, use fancier shallow parsing
        to get token lemmas; otherwise use simple regexp tokenization.
        Defaults to whatever ``utils.has_pattern()`` returned when this
        function was defined.
    filter_namespaces : tuple of str, optional
        Namespace identifiers used for filtering; the default is ``('0',)``.
        NOTE(review): SOURCE only stores this value, so whether these
        namespaces are retained or skipped must be checked in the consumer.
    include_interlinks : bool, optional
        Whether or not interlinks should be included in the output.

    """
    worker_count = processes
    if worker_count is None:
        # Leave one core free, but always run at least one process.
        worker_count = max(1, multiprocessing.cpu_count() - 1)

    self.include_interlinks = include_interlinks
    self.min_article_character = min_article_character
    self.lemmatize = lemmatize
    self.processes = worker_count
    # Not configurable from this constructor: `metadata` is always False here.
    self.metadata = False
    self.filter_namespaces = filter_namespaces
    self.fileobj = fileobj