def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Enable verbose output, then delegate setup to the base tokenizer.

    :param stemming_language: stemming language passed to the base class
    :param remove_stopwords: stopword-removal flag passed to the base class
    :param remove_html_markup: HTML-stripping flag passed to the base class
    :param remove_latex_markup: LaTeX-stripping flag passed to the base class
    """
    # NOTE(review): verbosity level 3 — exact meaning depends on the
    # tokenizer framework; confirm against BibIndexDefaultTokenizer.
    # Assigned before the parent initialiser runs, as in the original.
    self.verbose = 3
    BibIndexDefaultTokenizer.__init__(self,
                                      stemming_language,
                                      remove_stopwords,
                                      remove_html_markup,
                                      remove_latex_markup)
def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Forward all configuration options to the default tokenizer.

    :param stemming_language: stemming language passed to the base class
    :param remove_stopwords: stopword-removal flag passed to the base class
    :param remove_html_markup: HTML-stripping flag passed to the base class
    :param remove_latex_markup: LaTeX-stripping flag passed to the base class
    """
    BibIndexDefaultTokenizer.__init__(
        self,
        stemming_language,
        remove_stopwords,
        remove_html_markup,
        remove_latex_markup,
    )
def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Initialise the tokenizer and precompile author-name helpers.

    :param stemming_language: stemming language passed to the base class
    :param remove_stopwords: stopword-removal flag passed to the base class
    :param remove_html_markup: HTML-stripping flag passed to the base class
    :param remove_latex_markup: LaTeX-stripping flag passed to the base class
    """
    BibIndexDefaultTokenizer.__init__(self, stemming_language,
                                      remove_stopwords,
                                      remove_html_markup,
                                      remove_latex_markup)
    # Raw strings: '\w' / '\.' in a plain literal are invalid escape
    # sequences (DeprecationWarning on Python 3); r'...' keeps the same
    # pattern text warning-free.
    # Matches a lone single-letter initial such as "J."
    self.single_initial_re = re.compile(r'^\w\.$')
    # Splits a name on dots, whitespace, or hyphens.
    self.split_on_re = re.compile(r'[\.\s-]')
    # lastname_stopwords describes terms which should not be used for
    # indexing, in multiple-word last names. These are purely
    # conjunctions, serving the same function as the American hyphen,
    # but using linguistic constructs.
    self.lastname_stopwords = {'y', 'of', 'and', 'de'}
def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Initialise the tokenizer and precompile author-name helpers.

    :param stemming_language: stemming language passed to the base class
    :param remove_stopwords: stopword-removal flag passed to the base class
    :param remove_html_markup: HTML-stripping flag passed to the base class
    :param remove_latex_markup: LaTeX-stripping flag passed to the base class
    """
    BibIndexDefaultTokenizer.__init__(self, stemming_language,
                                      remove_stopwords,
                                      remove_html_markup,
                                      remove_latex_markup)
    # Raw strings: '\w' / '\.' in a plain literal are invalid escape
    # sequences (DeprecationWarning on Python 3); r'...' keeps the same
    # pattern text warning-free.
    # Matches a lone single-letter initial such as "J."
    self.single_initial_re = re.compile(r'^\w\.$')
    # Splits a name on dots, whitespace, or hyphens.
    self.split_on_re = re.compile(r'[\.\s-]')
    # lastname_stopwords describes terms which should not be used for
    # indexing, in multiple-word last names. These are purely
    # conjunctions, serving the same function as the American hyphen,
    # but using linguistic constructs.
    self.lastname_stopwords = {'y', 'of', 'and', 'de'}
def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Initialisation: hand every option straight to the base tokenizer.

    :param stemming_language: stemming language passed to the base class
    :param remove_stopwords: stopword-removal flag passed to the base class
    :param remove_html_markup: HTML-stripping flag passed to the base class
    :param remove_latex_markup: LaTeX-stripping flag passed to the base class
    """
    BibIndexDefaultTokenizer.__init__(
        self, stemming_language, remove_stopwords,
        remove_html_markup, remove_latex_markup)