示例#1
0
class SVMTextEncoder(BaseEstimator, TransformerMixin):
    """Encode a text column as linear-SVM scores over TF-IDF features.

    The text is vectorized with TF-IDF (unigrams and bigrams, at most 30000
    features) and passed to a ``LinearSVC`` when ``metric`` is a
    classification metric or a ``LinearSVR`` when it is a regression metric.

    ``fit_transform`` fits the vectorizer and model on all rows and returns
    cross-validated predictions for those rows; ``transform`` scores new
    rows with the fitted model.  Both always return a 2-D array.

    Args:
        metric: Metric identifier; must be a member of
            ``classification_metrics`` or ``regression_metrics``.
        random_seed: Seed passed to the SVM and to the fallback K-fold
            splitter.

    Raises:
        AttributeError: If ``metric`` is in neither metric collection.
    """

    # number of jobs to execute in parallel
    NUM_JOBS = 3
    # number of folds to apply to svm fit
    NUM_FOLDS = 3

    # TODO: add hyper-parameter tuning
    def __init__(self, metric, random_seed):
        super().__init__()

        # BaseEstimator.get_params() (and therefore repr()) reads every
        # constructor argument back from an attribute of the same name.
        self.metric = metric
        self.random_seed = random_seed

        # ngram_range is a tuple: recent scikit-learn releases validate
        # parameter types and reject a list here.
        self._vect = TfidfVectorizer(ngram_range=(1, 2), max_features=30000)
        self._random_seed = random_seed

        if metric in classification_metrics:
            self._model = LinearSVC(class_weight="balanced",
                                    random_state=random_seed)
            self.mode = "classification"
        elif metric in regression_metrics:
            self._model = LinearSVR(random_state=random_seed)
            self.mode = "regression"
        else:
            raise AttributeError(
                "metric not in classification or regression metrics")

    @staticmethod
    def _clean_text(X):
        """Squeeze X to one dimension and replace missing entries with
        MISSING_VALUE_INDICATOR, returning the underlying array."""
        return pd.Series(X.squeeze()).fillna(MISSING_VALUE_INDICATOR).values

    @staticmethod
    def _as_2d(out):
        """Return 1-D model output as a single column; pass 2-D through."""
        return out.reshape(-1, 1) if out.ndim == 1 else out

    def fit(self, X, y):
        """Unsupported: fitting is only available through ``fit_transform``.

        Raises:
            NotImplementedError: Always.
        """
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it produced an unrelated TypeError.
        raise NotImplementedError(
            "SVMTextEncoder.fit is not implemented; use fit_transform")

    def transform(self, X):
        """Score ``X`` with the fitted vectorizer and model.

        Returns:
            2-D array of decision-function values (classification) or
            predictions (regression), one row per input row.
        """
        Xv = self._vect.transform(self._clean_text(X))
        if self.mode == "classification":
            out = self._model.decision_function(Xv)
        else:
            out = self._model.predict(Xv)
        return self._as_2d(out)

    def fit_transform(self, X, y=None, **kwargs):
        """Fit on ``(X, y)`` and return cross-validated scores for ``X``.

        The vectorizer and model are fitted on every row so ``transform``
        can be used afterwards; the returned values come from
        ``cross_val_predict``, which refits clones of the model per fold.

        Args:
            X: Text column (anything with ``squeeze()`` accepted by
                ``pd.Series``).
            y: Targets; required.
            **kwargs: Accepted for signature compatibility and ignored.

        Returns:
            2-D array with one row per input row.
        """
        assert y is not None, "SVMTextEncoder.fit_transform requires y"

        Xv = self._vect.fit_transform(self._clean_text(X))
        self._model = self._model.fit(Xv, y)

        cv = self.NUM_FOLDS
        predict_kwargs = {}
        if self.mode == "classification":
            predict_kwargs["method"] = "decision_function"

            # Aim for NUM_FOLDS stratified folds, capped by the size of the
            # rarest class.  Plain arrays/lists have no value_counts().
            if hasattr(y, "value_counts"):
                class_counts = y.value_counts()
            else:
                class_counts = pd.Series(y).value_counts()
            num_folds = min(self.NUM_FOLDS, class_counts.min())

            if num_folds < 2:
                # Some class has a single sample, so stratification is
                # impossible; fall back to uniform sampling.  shuffle=True is
                # required for random_state to be accepted and to have effect.
                cv = KFold(n_splits=self.NUM_FOLDS,
                           shuffle=True,
                           random_state=self._random_seed)
            else:
                # An integer cv yields stratified folds for classifiers.
                cv = int(num_folds)

        out = cross_val_predict(self._model,
                                Xv,
                                y,
                                n_jobs=self.NUM_JOBS,
                                cv=cv,
                                **predict_kwargs)
        return self._as_2d(out)