예제 #1
0
    def classifier(self):
        """Assemble the title + content estimator pipeline.

        Two parallel branches pull text out of the input with
        ``self.extract_title`` and ``self.extract_content`` and vectorize it
        with a bigram-only ``MarisaTfidfVectorizer``.  A ``FeatureUnion``
        combines both branches (title weighted 0.5, content weighted 1.0);
        a ``RandomUnderSampler`` step and an ``SGDClassifier`` follow.

        Returns:
            Pipeline: the freshly built, unfitted pipeline with the steps
            ``features`` -> ``sampling`` -> ``clf``.
        """
        seed = BaseClassifier.RANDOM_SEED
        # Both branches share the exact same vectorizer configuration.
        tfidf_options = {'strip_accents': 'ascii', 'ngram_range': (2, 2)}

        title_steps = [
            ('selector1',
             FunctionTransformer(self.extract_title, validate=False)),
            ('tfidf1', MarisaTfidfVectorizer(**tfidf_options)),
        ]
        content_steps = [
            ('selector2',
             FunctionTransformer(self.extract_content, validate=False)),
            ('tfidf2', MarisaTfidfVectorizer(**tfidf_options)),
        ]

        features = FeatureUnion(
            transformer_list=[
                ('title', Pipeline(title_steps)),
                ('content', Pipeline(content_steps)),
            ],
            transformer_weights={'title': 0.5, 'content': 1.0},
        )

        # The sampler and the classifier are seeded with the same constant.
        sampler = RandomUnderSampler(random_state=seed)
        # NOTE(review): newer scikit-learn releases spell this loss
        # 'log_loss' -- confirm against the pinned version before upgrading.
        model = SGDClassifier(max_iter=1000,
                              loss='log',
                              tol=1e-3,
                              random_state=seed)

        return Pipeline([
            ('features', features),
            ('sampling', sampler),
            ('clf', model),
        ])
예제 #2
0
 def classifier(self):
     """Assemble the title-only estimator pipeline.

     The text returned by ``self.extract_title`` is vectorized with a
     ``MarisaTfidfVectorizer`` over 1- to 3-grams, passed through a
     ``RandomUnderSampler`` step and handed to ``MultinomialNB``.

     Returns:
         Pipeline: the freshly built, unfitted pipeline with the steps
         ``selector`` -> ``tfidf`` -> ``sampling`` -> ``clf``.
     """
     title_selector = FunctionTransformer(self.extract_title,
                                          validate=False)
     vectorizer = MarisaTfidfVectorizer(strip_accents='ascii',
                                        ngram_range=(1, 3))
     # Seeded with the shared constant from BaseClassifier.
     sampler = RandomUnderSampler(
         random_state=BaseClassifier.RANDOM_SEED)
     naive_bayes = MultinomialNB(fit_prior=False)

     return Pipeline([
         ('selector', title_selector),
         ('tfidf', vectorizer),
         ('sampling', sampler),
         ('clf', naive_bayes),
     ])
예제 #3
0
    def classifier(self):
        """Build the TF-IDF vectorizer with multilingual stop-word removal.

        Stop words for English, Portuguese and Spanish are fetched through
        ``stopwords.words`` and merged into a single list.  Because the
        vectorizer is configured with ``strip_accents='ascii'``, an
        ASCII-folded copy of every stop word is appended as well, so that
        accented entries can still match tokens whose accents were removed.

        Returns:
            MarisaTfidfVectorizer: a freshly built, unfitted vectorizer over
            1- to 3-grams with ``max_df=0.1`` and ``min_df=5``.
        """
        import unicodedata

        def fold_to_ascii(word):
            # NFKD separates base letters from their combining marks; the
            # ASCII encode with errors ignored then drops the marks.
            decomposed = unicodedata.normalize('NFKD', word)
            return decomposed.encode('ascii', 'ignore').decode('ascii')

        raw_stopwords = [
            word
            for language in ('english', 'portuguese', 'spanish')
            for word in stopwords.words(language)
        ]

        # NOTE(review): assumes MarisaTfidfVectorizer follows scikit-learn's
        # TfidfVectorizer, which strips accents *before* filtering stop
        # words -- confirm.  Under that order an accented stop word can never
        # match a token, so the folded forms are added.  The original words
        # are kept, which makes the list a superset of the previous one.
        folded_stopwords = [fold_to_ascii(word) for word in raw_stopwords]
        all_stopwords = raw_stopwords + [w for w in folded_stopwords if w]

        return MarisaTfidfVectorizer(strip_accents='ascii',
                                     ngram_range=(1, 3),
                                     max_df=0.1,
                                     min_df=5,
                                     use_idf=True,
                                     lowercase=True,
                                     stop_words=all_stopwords)
예제 #4
0
 def classifier(self):
     """Assemble the combined-text estimator pipeline.

     The output of ``self.join_text_and_content`` is vectorized with a
     ``MarisaTfidfVectorizer`` over 1- to 3-grams (``max_df=0.5``,
     ``min_df=5``), passed through a ``RandomUnderSampler`` step and handed
     to ``MultinomialNB``.

     Returns:
         Pipeline: the freshly built, unfitted pipeline with the steps
         ``preprocess`` -> ``tfidf`` -> ``sampling`` -> ``clf``.
     """
     joiner = FunctionTransformer(self.join_text_and_content,
                                  validate=False)
     tfidf_options = {
         'strip_accents': 'ascii',
         'ngram_range': (1, 3),
         'max_df': 0.5,
         'min_df': 5,
         'use_idf': True,
     }
     vectorizer = MarisaTfidfVectorizer(**tfidf_options)
     # Seeded with the shared constant from BaseClassifier.
     sampler = RandomUnderSampler(
         random_state=BaseClassifier.RANDOM_SEED)
     naive_bayes = MultinomialNB(fit_prior=False)

     return Pipeline([
         ('preprocess', joiner),
         ('tfidf', vectorizer),
         ('sampling', sampler),
         ('clf', naive_bayes),
     ])