Example #1
0
    def train(self):
        """Train the classifier on a stratified split of the data.

        Fits the pipeline via cross-validation, logs the confusion
        matrix, stores the misclassified test rows and train/test
        scores, and computes the per-feature influence table.
        """
        train_df, test_df = stratified_split(self.__data)

        # Column 0 holds the samples, column 1 the labels.
        X_train, y_train = train_df.iloc[:, 0].values, train_df.iloc[:, 1].values
        X_test, y_test = test_df.iloc[:, 0].values, test_df.iloc[:, 1].values

        self.clf_ = self.__cross_valid(X_train, y_train)
        logging.info('best clf:\n%s', print_pipe_model(self.clf_))

        # Predictions on both splits (cross_val_predict was considered
        # here at some point and dropped).
        y_train_pred = self.clf_.predict(X_train)
        y_test_pred = self.clf_.predict(X_test)
        logging.info('data shape: %s', X_test.shape)

        # Confusion matrix on the held-out split.
        self.confusion_matrix_ = confusion_matrix(y_test, y_test_pred)
        logging.info('confusion matrix:\n%s', pformat(self.confusion_matrix_))

        # Keep the misclassified test rows for later inspection.
        self.false_neg_ = test_df[(y_test == 1) & (y_test_pred == 0)]
        self.false_pos_ = test_df[(y_test == 0) & (y_test_pred == 1)]

        # Scores on both splits.
        self.train_score_ = evaluate(y_train, y_train_pred)
        self.test_score_ = evaluate(y_test, y_test_pred)

        # Most influential features. Standardize manually only when the
        # pipeline itself has no 'std' step.
        if get_pipe_model(self.clf_, 'std') is None:
            self.influence_ = self._get_influence_ft(self.clf_, X_train)
        else:
            self.influence_ = self._get_influence_ft(self.clf_)
        logging.info('Influence:\n%s\n...\n%s',
                     self.influence_.head(), self.influence_.tail())
Example #2
0
    def _get_influence_ft(self, model, X=None):
        """Build a (feature, influence) DataFrame sorted by influence, descending.

        When *X* is given, influences are standardized against the
        tf-idf-transformed samples. If the feature and influence counts
        disagree, an all-zero influence frame is returned instead.
        """
        tfidf = get_pipe_model(model, 'tfidf')
        # NOTE(review): get_feature_names() is deprecated in newer
        # scikit-learn (get_feature_names_out) — confirm pinned version.
        feature_names = tfidf.get_feature_names()
        influences = self._get_influences(model)

        columns = ['feature', 'influence']
        if len(feature_names) != len(influences):
            # Length mismatch: fall back to zero influence per feature.
            zeros = np.zeros(len(feature_names))
            return pd.DataFrame({'feature': feature_names, 'influence': zeros},
                                columns=columns)

        if X is not None:
            influences = self._calc_std_influences(influences,
                                                   tfidf.transform(X))

        result = pd.DataFrame({'feature': feature_names,
                               'influence': influences},
                              columns=columns)
        return result.sort_values(by='influence', ascending=False)
Example #3
0
def get_decision_features(clf, x):
    """Return the features that push the classifier's decision positive.

    For sample *x*, computes each feature's contribution (tf-idf value
    times logistic-regression coefficient), keeps only the top-weighted
    contributions needed to lift the decision value above zero, and
    returns them as ``(weight_ratio, feature_name)`` pairs, where the
    ratios of the kept features sum to 1.
    """
    tfidf = get_pipe_model(clf, 'tfidf')
    logre = get_pipe_model(clf, 'logre')

    # Per-feature contribution: tf-idf value * coefficient; keep only
    # features that are both present in x and have a non-zero weight.
    x_trans = get_pipe_transform(clf, x).toarray().reshape(-1)
    coef = logre.coef_.reshape(-1)
    weights = [(i, v * w)
               for i, (v, w) in enumerate(zip(x_trans, coef)) if v and w]
    weights.sort(key=itemgetter(1), reverse=True)

    # Trim non-contributing weights: keep the leading contributions
    # needed to push the decision value above zero.
    # (Fixes: NameError when `weights` is empty, and an off-by-one that
    # dropped the last contribution when the loop exhausted `weights`.)
    y = logre.intercept_[0]
    n_keep = 0
    for _, w in weights:
        if y > 0:   # the sample has become positive
            break
        y += w
        n_keep += 1
    weights = weights[:n_keep]

    # Contributing features and their weight ratios.
    w_sum = sum(w for _, w in weights)
    if not w_sum:
        # Nothing kept (or degenerate zero sum): no contributing features.
        return []
    features = tfidf.get_feature_names()
    return [(w / w_sum, features[i]) for i, w in weights]
Example #4
0
 def _get_influences(self, model):
     """Return the final pipeline estimator's ``feature_importances_``."""
     final_estimator = get_pipe_model(model, -1)
     return final_estimator.feature_importances_
Example #5
0
 def _get_influences(self, model):
     """Return the first coefficient row of the pipeline's final estimator."""
     final_estimator = get_pipe_model(model, -1)
     return final_estimator.coef_[0]