def __init__(self, session, config, parent):
    """Initialise the normalizer and cache its settings on the instance.

    Four settings are read, each defaulting to 0 when not configured:
    'useStem' -> self.stem, 'pos' -> self.pos, 'justPos' -> self.onlyPos
    and 'xml' -> self.xml.  A regular expression is also compiled into
    self.puncRe.
    """
    SimpleNormalizer.__init__(self, session, config, parent)
    # (attribute name, setting name) pairs; every one defaults to 0
    flag_settings = (
        ('stem', 'useStem'),
        ('pos', 'pos'),
        ('onlyPos', 'justPos'),
        ('xml', 'xml'),
    )
    for attr_name, setting_name in flag_settings:
        setattr(self, attr_name,
                self.get_setting(session, setting_name, 0))
    # Matches a space, then one punctuation mark followed by a space or
    # newline; the punctuation and its trailing whitespace are captured.
    self.puncRe = re.compile('[ ]([.,;:?!][ \n])')
def __init__(self, session, config, parent):
    """Initialise the normalizer with its list of POS types.

    self.types is the whitespace-split value of the 'posTypes' setting,
    or ['NN', 'NNP', 'NNS'] when that setting is unset or empty.
    self.keepPos holds the 'pos' setting (default 0).
    """
    SimpleNormalizer.__init__(self, session, config, parent)
    # Load types from config, defaulting to nouns
    configured = self.get_setting(session, 'posTypes')
    self.types = configured.split() if configured else ['NN', 'NNP', 'NNS']
    # Should we keep the /POS tag or strip it
    self.keepPos = self.get_setting(session, 'pos', 0)
def __init__(self, session, config, parent):
    """Resolve the configured Lucene filter and its optional argument.

    Settings read:
        filter   -- name of an attribute of the ``lucene`` module; the
                    name must end in 'Filter'.  Stored (as the resolved
                    attribute) in self.filter.
        argument -- optional value stored in self.argument; None when
                    the setting is unset or empty.

    Raises:
        ConfigFileException: if the 'filter' setting is missing or empty,
            does not end in 'Filter', or names nothing in ``lucene``.
    """
    SimpleNormalizer.__init__(self, session, config, parent)
    fltr = self.get_setting(session, 'filter')
    # Test for a missing/empty setting first: it is a configuration error
    # and must be reported as one, not fail on the string check below.
    if not (fltr and fltr.endswith('Filter') and hasattr(lucene, fltr)):
        raise ConfigFileException("Unknown Filter")
    self.filter = getattr(lucene, fltr)
    # eg SnowballFilter(strm, 'English')
    # For more complex filter constructors, just subclass
    # FilterNormalizer as required
    self.argument = self.get_setting(session, 'argument', '') or None
def __init__(self, session, config, parent):
    """Compile the phrase-matching and tag-stripping regular expressions.

    The phrase expression (self.pattern) is chosen as follows:
      1. the 'regexp' setting, compiled exactly as given;
      2. otherwise the 'pattern' setting, a shorthand in which the tokens
         JJ and NN and the quantifiers *, + and ? are expanded into a
         full regular expression;
      3. otherwise a default of zero or more JJ-tagged tokens followed
         by one or more NN-tagged tokens.

    Also sets self.strip (a regex matching /JJ.. and /NN.. tags in upper
    or lower case), self.minimum ('minimumWords' setting, default 0) and
    self.subPhrases ('subPhrases' setting, default 0).
    """
    SimpleNormalizer.__init__(self, session, config, parent)
    match = self.get_setting(session, 'regexp', '')
    if not match:
        match = self.get_setting(session, 'pattern')
        if not match:
            # Default phrase shape; JJ/NN are presumably Penn Treebank
            # adjective/noun tags -- confirm against the tagger in use.
            match = "((?:[ ][^\\s]+/JJ[SR]?)*)((?:[ ][^\\s]+/NN[SP]?)+)"
        else:
            # Close a capture group after every quantifier.  This must
            # happen before the tag expansion below, because the expanded
            # text itself contains '+' and '?' which must not be rewritten.
            match = match.replace('*', '*)')
            match = match.replace('+', '+)')
            match = match.replace('?', '?)')
            # Expand each tag into the opening of a capture group that
            # matches one space-prefixed token carrying that tag.
            match = match.replace('JJ', '((?:[ ][^\\s]+/JJ[SR]?)')
            match = match.replace('NN', '((?:[ ][^\\s]+/NN[SP]*)')
    self.pattern = re.compile(match)
    # Matches the /TAG suffixes recognised above, upper or lower case
    self.strip = re.compile('/(JJ[SR]?|NN[SP]*)|/(jj[sr]?|nn[sp]*)')
    self.minimum = self.get_setting(session, 'minimumWords', 0)
    self.subPhrases = self.get_setting(session, 'subPhrases', 0)
def __init__(self, session, config, parent):
    """Always fail: this variant reports that 'lucene' is unavailable.

    The base initialiser runs first, then MissingDependencyException is
    raised with this object's objectType and the dependency name.

    Raises:
        MissingDependencyException: unconditionally.
    """
    SimpleNormalizer.__init__(self, session, config, parent)
    missing_dependency = MissingDependencyException(self.objectType, "lucene")
    raise missing_dependency
def __init__(self, session, config, parent):
    """Initialise the normalizer with lucene.StopFilter as its filter."""
    SimpleNormalizer.__init__(self, session, config, parent)
    # The filter is fixed for this class rather than read from settings
    stop_filter = lucene.StopFilter
    self.filter = stop_filter
def __init__(self, session, config, parent):
    """Initialise the normalizer and its fixed list of novel codes.

    self.novels is a hard-coded list of short upper-case codes
    (presumably abbreviations of novel titles -- confirm with the data).
    """
    SimpleNormalizer.__init__(self, session, config, parent)
    novel_codes = [
        'BH', 'BR', 'DC', 'DS', 'ED',
        'GE', 'HT', 'LD', 'MC', 'NN',
        'OCS', 'OMF', 'OT', 'PP', 'TTC',
    ]
    self.novels = novel_codes
def __init__(self, session, config, parent):
    """Initialise the normalizer and its fixed list of text codes.

    self.dickens is a hard-coded list of short upper-case codes
    (presumably abbreviations of Dickens titles -- confirm with the
    data).  The element order of the original list is preserved.
    """
    SimpleNormalizer.__init__(self, session, config, parent)
    dickens_codes = [
        'AN', 'BH', 'BL', 'BR', 'CC', 'CHI',
        'CH', 'DC', 'DS', 'ED', 'GE', 'HM',
        'HT', 'LD', 'MC', 'NN', 'OCS', 'OMF',
        'OT', 'PP', 'SB', 'TTC', 'UT',
    ]
    self.dickens = dickens_codes