def __init__(self, tokenizer=None, pos_tagger=None, np_extractor=None,
             analyzer=None, parser=None, classifier=None):
    """Store the processing components, substituting defaults for ``None``.

    Each default component other than the tokenizer is constructed around
    the tokenizer selected here, so the tokenizer is resolved first.

    :param tokenizer: Tokenizer to use; ``NLTKPunktTokenizer()`` when omitted.
    :param pos_tagger: Part-of-speech tagger; ``PatternTagger`` when omitted.
    :param np_extractor: Noun-phrase extractor;
        ``PatternParserNPExtractor`` when omitted.
    :param analyzer: Analyzer; ``PatternAnalyzer`` when omitted.
    :param parser: Parser; ``PatternParser`` when omitted.
    :param classifier: Classifier, stored exactly as given (no default).
    """
    if tokenizer is None:
        tokenizer = NLTKPunktTokenizer()
    self.tokenizer = tokenizer

    if pos_tagger is None:
        pos_tagger = PatternTagger(tokenizer=self.tokenizer)
    self.pos_tagger = pos_tagger

    if np_extractor is None:
        np_extractor = PatternParserNPExtractor(tokenizer=self.tokenizer)
    self.np_extractor = np_extractor

    if analyzer is None:
        analyzer = PatternAnalyzer(tokenizer=self.tokenizer)
    self.analyzer = analyzer

    if parser is None:
        parser = PatternParser(tokenizer=self.tokenizer)
    self.parser = parser

    # There is no default classifier: ``None`` simply stays ``None``.
    self.classifier = classifier

    # Shared set-up step defined elsewhere in this module; it receives the
    # resolved components as stored on the instance.
    _initialize_models(
        self,
        self.tokenizer,
        self.pos_tagger,
        self.np_extractor,
        self.analyzer,
        self.parser,
        self.classifier,
    )
def __init__(self, text, tokenizer=None, pos_tagger=None, np_extractor=None,
             analyzer=None, parser=None, classifier=None, clean_html=False):
    """Validate ``text``, then store it along with the processing components.

    The arguments are checked before any default component is constructed,
    so an invalid call fails immediately instead of first building models
    that would be discarded.

    :param text: The text to wrap; must be an instance of ``basestring``.
    :param tokenizer: Tokenizer to use; ``NLTKPunktTokenizer()`` when omitted.
    :param pos_tagger: Part-of-speech tagger; ``PatternTagger`` when omitted.
    :param np_extractor: Noun-phrase extractor;
        ``PatternParserNPExtractor`` when omitted.
    :param analyzer: Analyzer; ``PatternAnalyzer`` when omitted.
    :param parser: Parser; ``PatternParser`` when omitted.
    :param classifier: Classifier, stored exactly as given (no default).
    :param clean_html: No longer supported; any truthy value raises.
    :raises TypeError: If ``text`` is not a string.
    :raises NotImplementedError: If ``clean_html`` is truthy.
    """
    # Fail fast: reject bad arguments before paying for model construction.
    # ``basestring`` is expected to be provided elsewhere in this module.
    if not isinstance(text, basestring):
        raise TypeError('The `text` argument passed to `__init__(text)` '
                        'must be a string, not {0}'.format(type(text)))
    if clean_html:
        raise NotImplementedError(
            "clean_html has been deprecated. "
            "To remove HTML markup, use BeautifulSoup's "
            "get_text() function")

    # The tokenizer is resolved first because every other default component
    # is built around it.
    self.tokenizer = tokenizer if tokenizer is not None \
        else NLTKPunktTokenizer()
    self.pos_tagger = pos_tagger if pos_tagger is not None \
        else PatternTagger(tokenizer=self.tokenizer)
    self.np_extractor = np_extractor if np_extractor is not None \
        else PatternParserNPExtractor(tokenizer=self.tokenizer)
    self.analyzer = analyzer if analyzer is not None \
        else PatternAnalyzer(tokenizer=self.tokenizer)
    self.parser = parser if parser is not None \
        else PatternParser(tokenizer=self.tokenizer)
    # There is no default classifier: ``None`` simply stays ``None``.
    self.classifier = classifier

    # ``raw`` and ``string`` both refer to the unmodified input text.
    self.raw = self.string = text
    self.stripped = lowerstrip(self.raw, all=True)

    # Shared set-up step defined elsewhere in this module; it receives the
    # resolved components as stored on the instance.
    _initialize_models(self, self.tokenizer, self.pos_tagger,
                       self.np_extractor, self.analyzer, self.parser,
                       self.classifier)
def __init__(self, text, tokenizer=None, pos_tagger=None, np_extractor=None,
             analyzer=None, parser=None, classifier=None, clean_html=False):
    """Check the arguments, then record the text and processing components.

    Validation happens up front so that a bad ``text`` value or a
    ``clean_html`` request is rejected before any default component is
    instantiated.

    :param text: The text to wrap; must be an instance of ``basestring``.
    :param tokenizer: Tokenizer to use; ``NLTKPunktTokenizer()`` when omitted.
    :param pos_tagger: Part-of-speech tagger; ``PatternTagger`` when omitted.
    :param np_extractor: Noun-phrase extractor;
        ``PatternParserNPExtractor`` when omitted.
    :param analyzer: Analyzer; ``PatternAnalyzer`` when omitted.
    :param parser: Parser; ``PatternParser`` when omitted.
    :param classifier: Classifier, stored exactly as given (no default).
    :param clean_html: No longer supported; any truthy value raises.
    :raises TypeError: If ``text`` is not a string.
    :raises NotImplementedError: If ``clean_html`` is truthy.
    """
    # Guard clauses first, so an invalid call never constructs models.
    # ``basestring`` is expected to be provided elsewhere in this module.
    if not isinstance(text, basestring):
        raise TypeError('The `text` argument passed to `__init__(text)` '
                        'must be a string, not {0}'.format(type(text)))
    if clean_html:
        raise NotImplementedError(
            "clean_html has been deprecated. "
            "To remove HTML markup, use BeautifulSoup's "
            "get_text() function")

    # Resolve the tokenizer before anything else: the remaining defaults
    # are all constructed with it.
    self.tokenizer = tokenizer if tokenizer is not None \
        else NLTKPunktTokenizer()
    self.pos_tagger = pos_tagger if pos_tagger is not None \
        else PatternTagger(tokenizer=self.tokenizer)
    self.np_extractor = np_extractor if np_extractor is not None \
        else PatternParserNPExtractor(tokenizer=self.tokenizer)
    self.analyzer = analyzer if analyzer is not None \
        else PatternAnalyzer(tokenizer=self.tokenizer)
    self.parser = parser if parser is not None \
        else PatternParser(tokenizer=self.tokenizer)
    # No default exists for the classifier; keep whatever was passed.
    self.classifier = classifier

    # ``raw`` and ``string`` both refer to the unmodified input text.
    self.raw = self.string = text
    self.stripped = lowerstrip(self.raw, all=True)

    # Shared set-up step defined elsewhere in this module; it receives the
    # resolved components as stored on the instance.
    _initialize_models(
        self,
        self.tokenizer,
        self.pos_tagger,
        self.np_extractor,
        self.analyzer,
        self.parser,
        self.classifier,
    )