class SVMTextEncoder(BaseEstimator, TransformerMixin):
    """Encode a free-text column as linear-SVM outputs.

    A TF-IDF vectorizer feeds a :class:`LinearSVC` (classification) or
    :class:`LinearSVR` (regression).  ``transform`` emits the SVM's
    ``decision_function`` values (classification) or ``predict`` values
    (regression) as a 2-D array, so the encoder can be used as a feature
    generator inside a larger pipeline.
    """

    # number of jobs to execute in parallel
    NUM_JOBS = 3
    # number of folds to apply to svm fit
    NUM_FOLDS = 3
    # !! add tuning

    def __init__(self, metric, random_seed):
        """Build the vectorizer and pick the SVM flavor from *metric*.

        Args:
            metric: a metric name; must appear in either
                ``classification_metrics`` or ``regression_metrics``.
            random_seed: seed forwarded to the underlying SVM (and to the
                KFold fallback in ``fit_transform``) for reproducibility.

        Raises:
            AttributeError: if *metric* is in neither metric set.
        """
        super().__init__()
        # ngram_range must be a tuple — newer sklearn validates the type.
        self._vect = TfidfVectorizer(ngram_range=(1, 2), max_features=30000)
        self._random_seed = random_seed
        if metric in classification_metrics:
            self._model = LinearSVC(class_weight="balanced",
                                    random_state=random_seed)
            self.mode = "classification"
        elif metric in regression_metrics:
            self._model = LinearSVR(random_state=random_seed)
            self.mode = "regression"
        else:
            raise AttributeError(
                "metric not in classification or regression metrics")

    def fit(self, X, y):
        # Deliberately unsupported: this encoder only works through
        # fit_transform (it needs out-of-fold predictions at train time).
        # NOTE: was `raise NotImplemented`, which is a TypeError in
        # Python 3 — NotImplemented is a sentinel value, not an exception.
        raise NotImplementedError

    def transform(self, X):
        """Vectorize *X* and return the fitted SVM's outputs as 2-D array."""
        X = pd.Series(X.squeeze()).fillna(MISSING_VALUE_INDICATOR).values
        Xv = self._vect.transform(X)
        if self.mode == "classification":
            out = self._model.decision_function(Xv)
        else:
            out = self._model.predict(Xv)
        # Binary classification / regression yield a 1-D vector; downstream
        # consumers expect a column matrix.
        if len(out.shape) == 1:
            out = out.reshape(-1, 1)
        return out

    def fit_transform(self, X, y=None, **kwargs):
        """Fit vectorizer + SVM on (X, y); return out-of-fold outputs.

        Cross-validated predictions are used instead of in-sample ones so
        the encoded feature does not leak the training labels.
        """
        assert y is not None, "SVMTextEncoder.fit_transform requires y"

        X = pd.Series(X.squeeze()).fillna(MISSING_VALUE_INDICATOR).values
        Xv = self._vect.fit_transform(X)
        self._model = self._model.fit(Xv, y)

        if self.mode == "classification":
            # Aim for NUM_FOLDS and stratified k-fold. If that doesn't work,
            # fall back to uniform sampling.
            # y may be an ndarray rather than a Series — wrap it so
            # value_counts() is always available.
            num_folds = min(self.NUM_FOLDS, pd.Series(y).value_counts().min())
            if num_folds < 2:
                # shuffle=True is required whenever random_state is set
                # (sklearn >= 0.24 raises otherwise).
                cv = KFold(n_splits=self.NUM_FOLDS, shuffle=True,
                           random_state=self._random_seed)
                out = cross_val_predict(
                    self._model,
                    Xv,
                    y,
                    method="decision_function",
                    n_jobs=self.NUM_JOBS,
                    cv=cv,
                )
            else:
                # An int cv with a classifier uses StratifiedKFold.
                out = cross_val_predict(
                    self._model,
                    Xv,
                    y,
                    method="decision_function",
                    n_jobs=self.NUM_JOBS,
                    cv=num_folds,
                )
        else:
            out = cross_val_predict(self._model,
                                    Xv,
                                    y,
                                    n_jobs=self.NUM_JOBS,
                                    cv=self.NUM_FOLDS)

        # Normalize to a column matrix, matching transform().
        if len(out.shape) == 1:
            out = out.reshape(-1, 1)
        return out