def classifier(self):
    """Assemble the title + content classification pipeline.

    Two parallel branches each apply an extractor (``self.extract_title``
    or ``self.extract_content``) followed by a ``MarisaTfidfVectorizer``
    restricted to bigrams.  Their outputs are combined by a
    ``FeatureUnion`` that weights the title branch at 0.5 and the content
    branch at 1.0, then passed through a ``RandomUnderSampler`` and an
    ``SGDClassifier``.

    Returns:
        The unfitted ``Pipeline`` with steps ``features``, ``sampling``
        and ``clf``.
    """
    seed = BaseClassifier.RANDOM_SEED

    # Content branch: extractor followed by bigram vectorizer.
    content_steps = [
        ('selector2', FunctionTransformer(self.extract_content, validate=False)),
        ('tfidf2', MarisaTfidfVectorizer(strip_accents='ascii', ngram_range=(2, 2))),
    ]
    # Title branch: same shape, different extractor and step names.
    title_steps = [
        ('selector1', FunctionTransformer(self.extract_title, validate=False)),
        ('tfidf1', MarisaTfidfVectorizer(strip_accents='ascii', ngram_range=(2, 2))),
    ]

    branches = [
        ('title', Pipeline(title_steps)),
        ('content', Pipeline(content_steps)),
    ]
    # Title features are scaled to half the weight of content features.
    branch_weights = {'title': 0.5, 'content': 1.0}
    combined_features = FeatureUnion(
        transformer_list=branches,
        transformer_weights=branch_weights,
    )

    # NOTE(review): scikit-learn 1.1 renamed loss='log' to 'log_loss' and
    # 1.3 removed the old spelling -- confirm against the pinned version.
    estimator = SGDClassifier(
        max_iter=1000,
        loss='log',
        tol=1e-3,
        random_state=seed,
    )

    return Pipeline([
        ('features', combined_features),
        ('sampling', RandomUnderSampler(random_state=seed)),
        ('clf', estimator),
    ])
def classifier(self):
    """Assemble the title-only classification pipeline.

    Steps, in order: apply ``self.extract_title``, vectorize with
    ``MarisaTfidfVectorizer`` over 1- to 3-grams, rebalance with
    ``RandomUnderSampler`` and classify with ``MultinomialNB``.

    Returns:
        The unfitted ``Pipeline`` with steps ``selector``, ``tfidf``,
        ``sampling`` and ``clf``.
    """
    title_selector = FunctionTransformer(self.extract_title, validate=False)
    vectorizer = MarisaTfidfVectorizer(
        strip_accents='ascii',
        ngram_range=(1, 3),
    )
    under_sampler = RandomUnderSampler(random_state=BaseClassifier.RANDOM_SEED)
    # fit_prior=False: class priors are not learned from the training data.
    naive_bayes = MultinomialNB(fit_prior=False)

    steps = [
        ('selector', title_selector),
        ('tfidf', vectorizer),
        ('sampling', under_sampler),
        ('clf', naive_bayes),
    ]
    return Pipeline(steps)
def classifier(self):
    """Build the vectorizer with English, Portuguese and Spanish stop words.

    The stop-word lists come from ``stopwords.words`` for the three
    languages.  Because the vectorizer is configured with
    ``strip_accents='ascii'``, every stop word is folded to ASCII the same
    way before being handed over; otherwise accented entries (for example
    "não" or "también") could never match the already accent-stripped
    tokens and would silently stay in the vocabulary.
    NOTE(review): this assumes MarisaTfidfVectorizer filters stop words
    after accent stripping, as scikit-learn's TfidfVectorizer does -- confirm.

    Returns:
        An unfitted ``MarisaTfidfVectorizer`` over 1- to 3-grams with
        ``max_df=0.1`` and ``min_df=5``.
    """
    import unicodedata

    def _fold_to_ascii(word):
        # NFKD-normalise, then drop whatever cannot be encoded as ASCII.
        decomposed = unicodedata.normalize('NFKD', word)
        return decomposed.encode('ascii', 'ignore').decode('ascii')

    raw_stopwords = stopwords.words('english') + \
        stopwords.words('portuguese') + \
        stopwords.words('spanish')

    # dict.fromkeys de-duplicates while keeping the original order; empty
    # strings (words made only of non-ASCII characters) are discarded.
    folded = (_fold_to_ascii(word) for word in raw_stopwords)
    all_stopwords = list(dict.fromkeys(word for word in folded if word))

    return MarisaTfidfVectorizer(strip_accents='ascii', ngram_range=(1, 3),
                                 max_df=0.1, min_df=5, use_idf=True,
                                 lowercase=True, stop_words=all_stopwords)
def classifier(self):
    """Assemble the joined-text classification pipeline.

    Steps, in order: apply ``self.join_text_and_content``, vectorize with
    ``MarisaTfidfVectorizer`` over 1- to 3-grams (``max_df=0.5``,
    ``min_df=5``), rebalance with ``RandomUnderSampler`` and classify with
    ``MultinomialNB``.

    Returns:
        The unfitted ``Pipeline`` with steps ``preprocess``, ``tfidf``,
        ``sampling`` and ``clf``.
    """
    joiner = FunctionTransformer(self.join_text_and_content, validate=False)
    vectorizer = MarisaTfidfVectorizer(
        strip_accents='ascii',
        ngram_range=(1, 3),
        max_df=0.5,
        min_df=5,
        use_idf=True,
    )
    under_sampler = RandomUnderSampler(random_state=BaseClassifier.RANDOM_SEED)
    # fit_prior=False: class priors are not learned from the training data.
    naive_bayes = MultinomialNB(fit_prior=False)

    steps = [
        ('preprocess', joiner),
        ('tfidf', vectorizer),
        ('sampling', under_sampler),
        ('clf', naive_bayes),
    ]
    return Pipeline(steps)