def __init__(self, tags_only=True, input='content', encoding='utf-8',
             charset=None, decode_error='strict', charset_error=None,
             strip_accents=None, lowercase=True, preprocessor=None,
             tokenizer=None, analyzer='word', stop_words=None,
             token_pattern=r"(?u)\b\w\w+\b", ngram_range=(1, 1),
             max_df=1.0, min_df=1, max_features=None, vocabulary=None,
             binary=False, dtype=np.int64, norm='l2', use_idf=True,
             smooth_idf=True, sublinear_tf=False):
    """Initialize the NER-based vectorizer.

    Mirrors the TfidfVectorizer constructor, forwarding every option
    to the superclass, then attaches the NER tagger client.

    :param tags_only: if True, only entity tags are used as features;
        otherwise tags and entities are both used (see the class's
        analyzer).  All remaining parameters are standard
        TfidfVectorizer options.
    """
    super(NERVectorizer, self).__init__(
        input=input, charset=charset, charset_error=charset_error,
        encoding=encoding, decode_error=decode_error,
        strip_accents=strip_accents, lowercase=lowercase,
        preprocessor=preprocessor, tokenizer=tokenizer,
        analyzer=analyzer, stop_words=stop_words,
        token_pattern=token_pattern, ngram_range=ngram_range,
        max_df=max_df, min_df=min_df, max_features=max_features,
        vocabulary=vocabulary,
        # BUG FIX: the original hard-coded binary=False here, silently
        # ignoring the caller-supplied ``binary`` argument.
        binary=binary,
        dtype=dtype, norm=norm, use_idf=use_idf, smooth_idf=smooth_idf,
        sublinear_tf=sublinear_tf)
    self.tags_only = tags_only
    # Offline alternative tagger (requires the Stanford NER jar):
    # self.tagger = NERTagger(config.NER_MODEL_PATH, config.NER_JAR, encoding=self.encoding)
    self.tagger = SocketNER(host='localhost', port=config.NER_PORT,
                            collapse=False)
class NERVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer over named-entity annotations instead of raw tokens.

    Documents are sent to a Stanford NER server (via ``SocketNER``) and
    the returned entity annotations — either just the tags, or tags plus
    entities — become the features that are tf-idf weighted.
    """

    def __init__(self, tags_only=True, input='content', encoding='utf-8',
                 charset=None, decode_error='strict', charset_error=None,
                 strip_accents=None, lowercase=True, preprocessor=None,
                 tokenizer=None, analyzer='word', stop_words=None,
                 token_pattern=r"(?u)\b\w\w+\b", ngram_range=(1, 1),
                 max_df=1.0, min_df=1, max_features=None, vocabulary=None,
                 binary=False, dtype=np.int64, norm='l2', use_idf=True,
                 smooth_idf=True, sublinear_tf=False):
        """Forward every TfidfVectorizer option to the superclass.

        :param tags_only: when True the analyzer yields only entity
            tags; otherwise tags and entities are interleaved.
        """
        super(NERVectorizer, self).__init__(
            input=input, charset=charset, charset_error=charset_error,
            encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer,
            analyzer=analyzer, stop_words=stop_words,
            token_pattern=token_pattern, ngram_range=ngram_range,
            max_df=max_df, min_df=min_df, max_features=max_features,
            vocabulary=vocabulary,
            # BUG FIX: was hard-coded binary=False, silently ignoring
            # the caller-supplied ``binary`` argument.
            binary=binary,
            dtype=dtype, norm=norm, use_idf=use_idf,
            smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
        self.tags_only = tags_only
        # Offline alternative tagger (requires the Stanford NER jar):
        # self.tagger = NERTagger(config.NER_MODEL_PATH, config.NER_JAR, encoding=self.encoding)
        self.tagger = SocketNER(host='localhost', port=config.NER_PORT,
                                collapse=False)

    def build_analyzer(self):
        """Return a callable that handles preprocessing and tokenization"""
        # NOTE(review): each element of get_entities(doc) appears to be a
        # (tag, entity) pair, so t[0] would be the tag — confirm against
        # the SocketNER API before relying on this.
        if self.tags_only:
            get_tags = lambda doc: [t[0] for t in self.tagger.get_entities(doc)]
        else:
            # Flatten every pair so both tags and entities become features.
            get_tags = lambda doc: list(
                chain.from_iterable(self.tagger.get_entities(doc)))
        return lambda doc: self._word_ngrams(get_tags(doc))
def preprocess(self, pos=False, ner=False, tok_q=True):
    """Tokenize the question and optionally POS-tag / NER-tag the docs.

    :param pos: POS-tag every document into ``self.pos_docs``.
    :param ner: NER-tag every document into ``self.ne_docs`` via the
        remote NER server.
    :param tok_q: lowercase, strip punctuation, tokenize and lemmatize
        the question into ``self.tok_question``.
    """
    log.debug("preprocessing documents")
    if tok_q:
        bare = unicode(self.question).translate(self.delete_punctuation_map)
        words = nltk.word_tokenize(bare.lower())
        self.tok_question = [self.lem.lemmatize(w) for w in words]
    if pos:
        tagged = []
        for doc in self.docs:
            tagged.append(nltk.pos_tag(nltk.word_tokenize(doc)))
        self.pos_docs = tagged
    if ner:
        self.ner = SocketNER(host='localhost', port=config.NER_PORT,
                             collapse=False)
        self.ne_docs = [self.ner.get_entities(doc) for doc in self.docs]
def preprocess(self, pos=False, ner=False, tok_q=True):
    # Prepare the question/documents for extraction:
    #   tok_q: strip punctuation, lowercase, tokenize, lemmatize the question
    #   pos:   POS-tag every document into self.pos_docs
    #   ner:   run every document through the remote NER server into self.ne_docs
    log.debug("preprocessing documents")
    if tok_q:
        # Py2 unicode.translate with a delete-map removes punctuation in one pass.
        self.tok_question = unicode(self.question).translate(self.delete_punctuation_map)
        self.tok_question = nltk.word_tokenize(self.tok_question.lower())
        self.tok_question = [self.lem.lemmatize(word) for word in self.tok_question]
    if pos:
        # self.tok_docs = [nltk.word_tokenize(doc) for doc in self.docs]
        self.pos_docs = [nltk.pos_tag(nltk.word_tokenize(doc)) for doc in self.docs]
    if ner:
        # NOTE(review): assumes a Stanford NER server is listening on
        # localhost:config.NER_PORT — confirm the service is running.
        self.ner = SocketNER(host='localhost', port=config.NER_PORT, collapse=False)
        self.ne_docs = [self.ner.get_entities(doc) for doc in self.docs]
class BaseExtractor(object):
    """Shared scaffolding for question-answer extractors.

    Holds the question and its candidate documents, plus the lemmatizer
    and punctuation-stripping table used by subclasses.
    """

    def __init__(self, question, docs):
        self.docs = docs
        self.question = question
        self.lem = nltk.stem.wordnet.WordNetLemmatizer()
        # Py2 unicode.translate delete-map: every punctuation codepoint -> None.
        self.delete_punctuation_map = dict((ord(char), None) for char in string.punctuation)

    def preprocess(self, pos=False, ner=False, tok_q=True):
        # tok_q: strip punctuation, lowercase, tokenize, lemmatize the question.
        # pos:   POS-tag every document.  ner: NER-tag every document remotely.
        log.debug("preprocessing documents")
        if tok_q:
            self.tok_question = unicode(self.question).translate(self.delete_punctuation_map)
            self.tok_question = nltk.word_tokenize(self.tok_question.lower())
            self.tok_question = [self.lem.lemmatize(word) for word in self.tok_question]
        if pos:
            # self.tok_docs = [nltk.word_tokenize(doc) for doc in self.docs]
            self.pos_docs = [nltk.pos_tag(nltk.word_tokenize(doc)) for doc in self.docs]
        if ner:
            # NOTE(review): assumes a Stanford NER server listens on
            # localhost:config.NER_PORT — confirm the service is running.
            self.ner = SocketNER(host='localhost', port=config.NER_PORT, collapse=False)
            self.ne_docs = [self.ner.get_entities(doc) for doc in self.docs]

    def clean(self, s):
        # Canonical form of a candidate: punctuation-free, lowercased, lemmatized.
        return self.lem.lemmatize(unicode(s).translate(self.delete_punctuation_map).lower())

    def sort_candidates(self, candidates):
        """
        Takes a dict with frequencies {'a':2, 'b':4, 'c':1} and sorts them.
        Returns the list of sorted candidates with percentages.
        """
        if len(candidates) == 0:
            return None
        # automatically creates nested dict when they don't exist
        cleaned = defaultdict(dict)
        for item, count in candidates.iteritems():
            cleaned[self.clean(item)][item] = count
        results = {}
        for item, options in cleaned.iteritems():
            # Pick the most frequent surface form of each cleaned group,
            # crediting it with the group's combined count.  Ties go to
            # whichever option is iterated first (dict order).
            selected_option, max_count, total_count = None, 0, 0
            for option, count in options.iteritems():
                total_count += count
                if count > max_count:
                    selected_option, max_count = option, count
            results[selected_option] = total_count
        results = sorted(results.iteritems(), key=itemgetter(1), reverse=True)
        total = sum(count for item, count in results)
        # trim to first 10 items
        return [(item, count/float(total)) for item, count in results][:10]

    def answer(self):
        """
        Answer should return a sorted list of answer tuples with their confidence
        """
        return "I don't know how to answer that type of question yet"
def __init__(self, tags_only=True, input='content', encoding='utf-8',
             decode_error='strict', strip_accents=None, lowercase=True,
             preprocessor=None, tokenizer=None, analyzer='word',
             stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
             ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None,
             vocabulary=None, binary=False, dtype=np.int64, norm='l2',
             use_idf=True, smooth_idf=True, sublinear_tf=False):
    """Initialize the NER-based vectorizer.

    Mirrors the TfidfVectorizer constructor, forwarding every option
    to the superclass, then attaches the NER tagger client.

    :param tags_only: if True, only entity tags are used as features;
        otherwise tags and entities are both used.  All remaining
        parameters are standard TfidfVectorizer options.
    """
    super(NERVectorizer, self).__init__(
        input=input, encoding=encoding, decode_error=decode_error,
        strip_accents=strip_accents, lowercase=lowercase,
        preprocessor=preprocessor, tokenizer=tokenizer,
        analyzer=analyzer, stop_words=stop_words,
        token_pattern=token_pattern, ngram_range=ngram_range,
        max_df=max_df, min_df=min_df, max_features=max_features,
        vocabulary=vocabulary,
        # BUG FIX: the original hard-coded binary=False here, silently
        # ignoring the caller-supplied ``binary`` argument.
        binary=binary,
        dtype=dtype, norm=norm, use_idf=use_idf, smooth_idf=smooth_idf,
        sublinear_tf=sublinear_tf)
    self.tags_only = tags_only
    # Offline alternative tagger (requires the Stanford NER jar):
    # self.tagger = NERTagger(config.NER_MODEL_PATH, config.NER_JAR, encoding=self.encoding)
    self.tagger = SocketNER(host='localhost', port=config.NER_PORT,
                            collapse=False)
class NERVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer over named-entity annotations instead of raw tokens.

    Documents are sent to a Stanford NER server (via ``SocketNER``) and
    the returned entity annotations — either just the tags, or tags plus
    entities — become the features that are tf-idf weighted.
    """

    def __init__(self, tags_only=True, input='content', encoding='utf-8',
                 charset=None, decode_error='strict', charset_error=None,
                 strip_accents=None, lowercase=True, preprocessor=None,
                 tokenizer=None, analyzer='word', stop_words=None,
                 token_pattern=r"(?u)\b\w\w+\b", ngram_range=(1, 1),
                 max_df=1.0, min_df=1, max_features=None, vocabulary=None,
                 binary=False, dtype=np.int64, norm='l2', use_idf=True,
                 smooth_idf=True, sublinear_tf=False):
        """Forward every TfidfVectorizer option to the superclass.

        :param tags_only: when True the analyzer yields only entity
            tags; otherwise tags and entities are interleaved.
        """
        super(NERVectorizer, self).__init__(
            input=input, charset=charset, charset_error=charset_error,
            encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer,
            analyzer=analyzer, stop_words=stop_words,
            token_pattern=token_pattern, ngram_range=ngram_range,
            max_df=max_df, min_df=min_df, max_features=max_features,
            vocabulary=vocabulary,
            # BUG FIX: was hard-coded binary=False, silently ignoring
            # the caller-supplied ``binary`` argument.
            binary=binary,
            dtype=dtype, norm=norm, use_idf=use_idf,
            smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
        self.tags_only = tags_only
        # Offline alternative tagger (requires the Stanford NER jar):
        # self.tagger = NERTagger(config.NER_MODEL_PATH, config.NER_JAR, encoding=self.encoding)
        self.tagger = SocketNER(host='localhost', port=config.NER_PORT,
                                collapse=False)

    def build_analyzer(self):
        """Return a callable that handles preprocessing and tokenization"""
        # NOTE(review): each element of get_entities(doc) appears to be a
        # (tag, entity) pair, so t[0] would be the tag — confirm against
        # the SocketNER API before relying on this.
        if self.tags_only:
            get_tags = lambda doc: [t[0] for t in self.tagger.get_entities(doc)]
        else:
            # Flatten every pair so both tags and entities become features.
            get_tags = lambda doc: list(
                chain.from_iterable(self.tagger.get_entities(doc)))
        return lambda doc: self._word_ngrams(get_tags(doc))
class BaseExtractor(object):
    """Base class for answer extractors.

    Keeps the question and its candidate documents together with the
    lemmatizer and punctuation-stripping table shared by subclasses.
    """

    def __init__(self, question, docs):
        self.docs = docs
        self.question = question
        self.lem = nltk.stem.wordnet.WordNetLemmatizer()
        # Translation table mapping every punctuation codepoint to None.
        self.delete_punctuation_map = {ord(ch): None for ch in string.punctuation}

    def preprocess(self, pos=False, ner=False, tok_q=True):
        """Tokenize the question and optionally POS-/NER-tag the documents."""
        log.debug("preprocessing documents")
        if tok_q:
            bare = unicode(self.question).translate(self.delete_punctuation_map)
            words = nltk.word_tokenize(bare.lower())
            self.tok_question = [self.lem.lemmatize(w) for w in words]
        if pos:
            self.pos_docs = [nltk.pos_tag(nltk.word_tokenize(d)) for d in self.docs]
        if ner:
            self.ner = SocketNER(host='localhost', port=config.NER_PORT, collapse=False)
            self.ne_docs = [self.ner.get_entities(d) for d in self.docs]

    def clean(self, s):
        """Return the punctuation-free, lowercased, lemmatized form of s."""
        normalized = unicode(s).translate(self.delete_punctuation_map).lower()
        return self.lem.lemmatize(normalized)

    def sort_candidates(self, candidates):
        """
        Takes a dict with frequencies {'a':2, 'b':4, 'c':1} and sorts them.
        Returns the list of sorted candidates with percentages.
        """
        if not candidates:
            return None
        # Group raw candidates under their cleaned (canonical) form.
        grouped = defaultdict(dict)
        for candidate, freq in candidates.iteritems():
            grouped[self.clean(candidate)][candidate] = freq
        # Keep each group's most frequent surface form, credited with the
        # group's combined frequency (ties: first variant in dict order).
        merged = {}
        for variants in grouped.itervalues():
            best, best_freq, combined = None, 0, 0
            for variant, freq in variants.iteritems():
                combined += freq
                if freq > best_freq:
                    best, best_freq = variant, freq
            merged[best] = combined
        ranked = sorted(merged.iteritems(), key=itemgetter(1), reverse=True)
        grand_total = sum(freq for _, freq in ranked)
        # At most ten results, each expressed as a fraction of the total.
        return [(c, f / float(grand_total)) for c, f in ranked][:10]

    def answer(self):
        """
        Answer should return a sorted list of answer tuples with their confidence
        """
        return "I don't know how to answer that type of question yet"