def __init__(self, session, node, parent):
    """Initialise the extractor and read its entity-type settings.

    The ``entityTypes`` setting is read as a whitespace-separated list.
    Each entry is lower-cased and matched by prefix:

    * ``pe...``            -> ``PERSON``
    * ``pl...`` / ``g...`` -> ``GPE``
    * ``org...`` / ``co...`` -> ``ORGANIZATION``

    When the setting is empty or absent, all three types are enabled.

    Raises:
        MissingDependencyException: if the module-level ``nltk`` name
            is ``None``.
        ConfigFileException: if an entry matches none of the prefixes.
    """
    SimpleExtractor.__init__(self, session, node, parent)
    if nltk is None:
        raise MissingDependencyException(self.objectType, "nltk")

    # Ordered (prefixes, label) pairs; the first matching row wins.
    prefix_table = (
        (("pe",), "PERSON"),
        (("pl", "g"), "GPE"),
        (("org", "co"), "ORGANIZATION"),
    )

    # Load types from config
    configured = self.get_setting(session, "entityTypes")
    if not configured:
        # Default to all
        self.types = ["PERSON", "GPE", "ORGANIZATION"]
    else:
        self.types = []
        for raw_entry in configured.split():
            entry = raw_entry.lower()
            for prefixes, label in prefix_table:
                if entry.startswith(prefixes):
                    self.types.append(label)
                    break
            else:
                # No prefix row matched this entry.
                message = "Unknown entity type setting {0} on {1} {2}".format(
                    entry, self.__class__.__name__, self.id
                )
                raise ConfigFileException(message)

    # Should we keep the /POS tag or strip it
    self.keepPos = self.get_setting(session, "pos", 0)
def __init__(self, session, node, parent):
    """Construct the extractor, validating the configured entity types.

    Reads two settings:

    ``entityTypes``
        Whitespace-separated names, matched case-insensitively by
        prefix ('pe' -> PERSON; 'pl' or 'g' -> GPE; 'org' or 'co' ->
        ORGANIZATION). A false-y value enables all three.
    ``pos``
        Stored on ``self.keepPos`` (default 0); whether to keep the
        /POS tag or strip it.

    Raises:
        MissingDependencyException: when ``nltk`` is ``None``.
        ConfigFileException: on an unrecognised ``entityTypes`` entry.
    """
    SimpleExtractor.__init__(self, session, node, parent)
    if nltk is None:
        raise MissingDependencyException(self.objectType, 'nltk')

    # Checked top to bottom, mirroring the precedence of the prefixes.
    known = [
        ('pe', 'PERSON'),
        (('pl', 'g'), 'GPE'),
        (('org', 'co'), 'ORGANIZATION'),
    ]

    # Load types from config
    setting = self.get_setting(session, 'entityTypes')
    if setting:
        self.types = []
        for name in setting.split():
            name = name.lower()
            matched = next(
                (label for prefix, label in known if name.startswith(prefix)),
                None
            )
            if matched is None:
                raise ConfigFileException(
                    "Unknown entity type setting {0} on {1} {2}".format(
                        name, self.__class__.__name__, self.id
                    )
                )
            self.types.append(matched)
    else:
        # Default to all
        self.types = ['PERSON', 'GPE', 'ORGANIZATION']

    # Should we keep the /POS tag or strip it
    self.keepPos = self.get_setting(session, 'pos', 0)
def __init__(self, session, node, parent):
    """Initialise the extractor from its ``entityTypes``/``pos`` settings.

    ``entityTypes`` is split on whitespace; every entry is lower-cased
    and translated by prefix into one of 'PERSON', 'GPE' or
    'ORGANIZATION', collected in ``self.types``. With no setting, all
    three are used. ``self.keepPos`` holds the ``pos`` setting
    (default 0).

    Raises:
        ConfigFileException: if an entry starts with none of the
            recognised prefixes.
    """
    SimpleExtractor.__init__(self, session, node, parent)

    # Load types from config
    requested = self.get_setting(session, 'entityTypes')
    if not requested:
        # Default to all
        self.types = ['PERSON', 'GPE', 'ORGANIZATION']
    else:
        self.types = []
        for token in requested.split():
            token = token.lower()
            if token.startswith('pe'):
                self.types.append('PERSON')
                continue
            if token.startswith(('pl', 'g')):
                self.types.append('GPE')
                continue
            if token.startswith(('org', 'co')):
                self.types.append('ORGANIZATION')
                continue
            # Fell through every known prefix: configuration error.
            template = "Unknown entity type setting {0} on {1} {2}"
            raise ConfigFileException(
                template.format(token, self.__class__.__name__, self.id)
            )

    # Should we keep the /POS tag or strip it
    self.keepPos = self.get_setting(session, 'pos', 0)
def __init__(self, session, config, parent):
    """Initialise the extractor and load its XPath/template settings.

    Settings read (with their defaults):

    ``subXpaths``
        Space-separated specs, each of the form ``a|b|c`` and split on
        '|'; the resulting lists are stored in ``self.xpaths``.
    ``xpath``
        Stored in ``self.xpath``; defaults to 'toks/w'.
    ``template``
        %-style template stored in ``self.template``.
    """
    SimpleExtractor.__init__(self, session, config, parent)

    # default: <w p="POS" s="STEM" o="OFFSET">TEXT</w>
    #      --> TEXT/POS/STEM/OFFSET
    # XXX Can we xpathProcessor-ify these xpaths?
    #     too computationally expensive to bother?
    default_specs = (
        'word|./text()| pos|./@p|XX stem|./@s|./text() offset|./@o|-1'
    )
    spec_string = self.get_setting(session, 'subXpaths', default_specs)

    # One '|'-separated field list per space-separated spec.
    parsed = []
    for spec in spec_string.split(' '):
        parsed.append(spec.split('|'))
    self.xpaths = parsed

    self.xpath = self.get_setting(session, 'xpath', 'toks/w')

    default_template = '%(word)s/%(pos)s/%(stem)s/%(offset)s'
    self.template = self.get_setting(session, 'template', default_template)
def __init__(self, session, config, parent):
    """Set up the extractor's sub-XPath specs, element XPath and template.

    ``self.xpaths`` becomes a list of lists: the ``subXpaths`` setting
    is split on single spaces, then each piece on '|'. ``self.xpath``
    and ``self.template`` are taken from the ``xpath`` and ``template``
    settings respectively, each with a built-in default.
    """
    # Built-in defaults, used when the corresponding setting is absent.
    # default: <w p="POS" s="STEM" o="OFFSET">TEXT</w>
    #      --> TEXT/POS/STEM/OFFSET
    fallback_sub_xpaths = (
        'word|./text()| pos|./@p|XX stem|./@s|./text() offset|./@o|-1'
    )
    fallback_xpath = 'toks/w'
    fallback_template = '%(word)s/%(pos)s/%(stem)s/%(offset)s'

    SimpleExtractor.__init__(self, session, config, parent)

    # XXX Can we xpathProcessor-ify these xpaths?
    #     too computationally expensive to bother?
    self.xpaths = [
        chunk.split('|')
        for chunk in self.get_setting(
            session, 'subXpaths', fallback_sub_xpaths
        ).split(' ')
    ]
    self.xpath = self.get_setting(session, 'xpath', fallback_xpath)
    self.template = self.get_setting(session, 'template', fallback_template)
def process_eventList(self, session, data):
    """Extract from an event list, then post-process the result.

    Delegates to ``SimpleExtractor.process_eventList`` and passes what
    it returns through ``self._process_simpleHash``.
    """
    return self._process_simpleHash(
        SimpleExtractor.process_eventList(self, session, data)
    )
def process_string(self, session, data):
    """Extract from a string, then post-process the result.

    The base ``SimpleExtractor.process_string`` output is handed to
    ``self._process_simpleHash``, whose return value is returned.
    """
    base_result = SimpleExtractor.process_string(self, session, data)
    processed = self._process_simpleHash(base_result)
    return processed
def __init__(self, session, config, parent):
    """Initialise the extractor and record its join character.

    ``self.jchr`` is taken from the ``joinCharacter`` setting and
    defaults to a single space.
    """
    SimpleExtractor.__init__(self, session, config, parent)
    join_character = self.get_setting(session, 'joinCharacter', u' ')
    self.jchr = join_character
def __init__(self, session, config, parent):
    """Initialise the extractor and load the pos/stem/offset settings.

    Each of the ``pos``, ``stem`` and ``offset`` settings is stored on
    the instance under the same name, defaulting to 0 when unset.
    """
    SimpleExtractor.__init__(self, session, config, parent)
    # Setting name and attribute name coincide; read them in this order.
    for option in ('pos', 'stem', 'offset'):
        setattr(self, option, self.get_setting(session, option, 0))
def __init__(self, session, config, parent):
    """Run base initialisation, then always fail for missing rdflib.

    NOTE(review): presumably this is the stand-in used when rdflib
    could not be imported -- confirm against the surrounding module.

    Raises:
        MissingDependencyException: unconditionally, naming 'rdflib'.
    """
    SimpleExtractor.__init__(self, session, config, parent)
    # Base init runs first so that self.objectType is read afterwards.
    missing = MissingDependencyException(self.objectType, 'rdflib')
    raise missing