Example #1
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        """Fit one TF-IDF vectorizer per column plus downstream classifiers.

        Supports incremental training: if a prior state file exists (path
        looked up from ``config`` via ``self.load_key``), the saved
        vectorizers, cached TF-IDF matrices and targets are reloaded and
        merged with the new batch; otherwise a fresh fit is performed.
        The fitted state is optionally persisted again via ``self.save_key``.

        :param X: input frame; every column is treated as text
                  (datatable-like, exposes ``.names`` and ``.to_pandas()``
                  — assumed from usage; confirm against caller).
        :param y: target values.
        :param sample_weight: accepted for API compatibility; unused here.
        :param eval_set: accepted for API compatibility; unused here.
        :param sample_weight_eval_set: accepted for API compatibility; unused.
        """
        # Keep an untouched copy of the raw target: it is persisted below so
        # a later run can stack old + new targets for an incremental refit.
        y_ = y.copy()
        orig_cols = list(X.names)

        self.loaded = False

        # Config-driven optional paths for loading/saving incremental state.
        self.load_path = get_value(config, self.load_key)
        self.save_path = get_value(config, self.save_key)

        if self.load_path:
            # Restore vectorizers, cached TF-IDF matrices, previous params
            # and previous targets from an earlier fit.
            data = joblib.load(self.load_path)
            self.tfidf_objs = data["tf_idf_obj"]
            self.tf_idf_data = data["tf_idf_data"]
            self.prev_params = data["params"]
            self.target = data["target"]
            self.loaded = True

        if not self.loaded:
            # --- fresh fit: no prior state available ---
            if self.num_classes >= 2:
                # Classification: map labels to contiguous integer codes.
                lb = LabelEncoder()
                lb.fit(self.labels)
                y = lb.transform(y)

            self.tfidf_objs = {}
            self.tf_idf_data = {}
            new_X = None
            for col in X.names:
                # Treat the column as text; missing values become "NA".
                XX = X[:, col].to_pandas()
                XX = XX[col].astype(str).fillna("NA").values.tolist()
                tfidf_vec = TfidfVectorizer(**self.return_tfidf_params())
                tfidf_vec = self._fit_vectorizer(tfidf_vec, XX)
                XX = tfidf_vec.transform(XX)
                # Record the number of documents seen; used later when
                # merging document-frequency statistics across batches.
                tfidf_vec.N_ = XX.shape[0]
                self.tfidf_objs[col] = tfidf_vec
                self.tf_idf_data[col] = XX
                # Horizontally stack per-column sparse matrices into one.
                if new_X is None:
                    new_X = XX
                else:
                    new_X = sp.sparse.hstack([new_X, XX])
        else:
            # --- incremental fit: merge the new batch into loaded state ---
            # Train on old + new targets together.
            y_ = np.hstack([self.target, y_])
            y = y_.copy()
            if self.num_classes >= 2:
                lb = LabelEncoder()
                lb.fit(self.labels)
                y = lb.transform(y)

            new_X = None
            for col in X.names:
                XX = X[:, col].to_pandas()
                XX = XX[col].astype(str).fillna("NA").values.tolist()
                N_ = len(XX)
                # Fit a fresh vectorizer, configured identically to the
                # loaded one, on the new batch only.
                tfidf_vec = TfidfVectorizer()
                tfidf_vec.set_params(**self.tfidf_objs[col].get_params())
                try:
                    tfidf_vec.fit(XX)
                    new_data_avail = True
                except ValueError:
                    # e.g. empty vocabulary in the new batch; fall back to
                    # using the previously trained vectorizer unchanged.
                    new_data_avail = False
                if new_data_avail:
                    tfidf_vec.N_ = N_
                    pre_trained = self.tfidf_objs[col]
                    # Merge vocabulary/IDF statistics of old and new batches.
                    pre_trained = self.sync_vectorizers(pre_trained, tfidf_vec)
                else:
                    pre_trained = self.tfidf_objs[col]

                XX = pre_trained.transform(XX)
                self.tfidf_objs[col] = pre_trained

                # Align the cached (old) matrix with the freshly transformed
                # one and stack them so rows line up with the stacked target.
                XX = self.sync_tfidf(self.tf_idf_data[col], XX)
                self.tf_idf_data[col] = XX
                if new_X is None:
                    new_X = XX
                else:
                    new_X = sp.sparse.hstack([new_X, XX])

        models = [LogisticRegression(**self.return_lin_params())]
        if self.params["add_rf"]:
            # Optionally add a LightGBM classifier in random-forest mode.
            # NOTE(review): got_cpu_lgb/got_gpu_lgb look unused here —
            # presumably the import itself triggers the dynamic LightGBM
            # setup; confirm against h2oaicore conventions.
            from h2oaicore.lightgbm_dynamic import got_cpu_lgb, got_gpu_lgb
            import lightgbm as lgbm
            models.append(
                lgbm.LGBMClassifier(
                    boosting_type='rf',
                    colsample_bytree=.5,
                    subsample=.632,  # Standard RF bagging fraction
                    min_child_weight=2.5,
                    min_child_samples=5,
                    subsample_freq=1,
                    min_split_gain=0,
                    n_jobs=-1,
                    **self.return_rf_params()))

        # Fit every model on the combined sparse TF-IDF matrix.
        for m in models:
            m.fit(new_X, y)

        # All original columns get equal (dummy) importance.
        importances = [1] * len(orig_cols)
        self.set_model_properties(model={
            "model": models,
            "tf-idfs": self.tfidf_objs
        },
                                  features=orig_cols,
                                  importances=importances,
                                  iterations=0)
        if self.save_path:
            # Persist everything needed to resume training in a later run.
            joblib.dump(
                {
                    "tf_idf_obj": self.tfidf_objs,
                    "tf_idf_data": self.tf_idf_data,
                    "params": self.params,
                    "target": y_,
                }, self.save_path)
        # clear large objects to avoid large data in subprocess pipe
        self.tfidf_objs = None
        self.tf_idf_data = None
    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        """Fit per-column TF-IDF vectorizers plus downstream classifiers.

        Revised variant: only string (text) columns are vectorized,
        columns whose vocabulary cannot be built are skipped, and a
        ``ValueError`` from the LogisticRegression fit is converted to
        ``IgnoreEntirelyError`` so invalid mutated parameter combinations
        are ignored rather than failing the experiment.

        Supports incremental training via state saved/loaded through
        ``self.save_key`` / ``self.load_key`` config values.

        :param X: input frame (datatable-like: ``.names``, ``X[:, [str]]``
                  column-type selection, ``.to_pandas()``).
        :param y: target values.
        :param sample_weight: accepted for API compatibility; unused here.
        :param eval_set: accepted for API compatibility; unused here.
        :param sample_weight_eval_set: accepted for API compatibility; unused.
        """
        # Untouched copy of the raw target; persisted below so a later run
        # can stack old + new targets for an incremental refit.
        y_ = y.copy()
        orig_cols = list(X.names)
        # Restrict TF-IDF processing to string-typed (text-like) columns.
        text_names = X[:, [str]].names

        self.loaded = False

        # Config-driven optional paths for loading/saving incremental state.
        self.load_path = get_value(config, self.load_key)
        self.save_path = get_value(config, self.save_key)

        if self.load_path:
            # Restore vectorizers, cached TF-IDF matrices, previous params
            # and previous targets from an earlier fit.
            data = joblib.load(self.load_path)
            self.tfidf_objs = data["tf_idf_obj"]
            self.tf_idf_data = data["tf_idf_data"]
            self.prev_params = data["params"]
            self.target = data["target"]
            self.loaded = True

        if not self.loaded:
            # --- fresh fit: no prior state available ---
            if self.num_classes >= 2:
                # Classification: map labels to contiguous integer codes.
                lb = LabelEncoder()
                lb.fit(self.labels)
                y = lb.transform(y)

            self.tfidf_objs = {}
            self.tf_idf_data = {}
            new_X = None
            for col in text_names:
                # Treat the column as text; missing values become "NA".
                XX = X[:, col].to_pandas()
                XX = XX[col].astype(str).fillna("NA").values.tolist()
                tfidf_vec = TfidfVectorizer(**self.return_tfidf_params())
                try:
                    tfidf_vec = self._fit_vectorizer(tfidf_vec, XX)
                except ValueError as e:
                    # sklearn raises e.g. "empty vocabulary ..." for
                    # columns that contain no usable tokens.
                    if 'vocab' in str(e):
                        # skip non-text-like column
                        continue
                    else:
                        raise
                XX = tfidf_vec.transform(XX)
                # Record the number of documents seen; used later when
                # merging document-frequency statistics across batches.
                tfidf_vec.N_ = XX.shape[0]
                self.tfidf_objs[col] = tfidf_vec
                self.tf_idf_data[col] = XX
                # Horizontally stack per-column sparse matrices into one.
                if new_X is None:
                    new_X = XX
                else:
                    new_X = sp.sparse.hstack([new_X, XX])
        else:
            # --- incremental fit: merge the new batch into loaded state ---
            # Train on old + new targets together.
            y_ = np.hstack([self.target, y_])
            y = y_.copy()
            if self.num_classes >= 2:
                lb = LabelEncoder()
                lb.fit(self.labels)
                y = lb.transform(y)

            new_X = None
            for col in text_names:
                XX = X[:, col].to_pandas()
                XX = XX[col].astype(str).fillna("NA").values.tolist()
                N_ = len(XX)
                # Fit a fresh vectorizer, configured identically to the
                # loaded one, on the new batch only.
                tfidf_vec = TfidfVectorizer()
                tfidf_vec.set_params(**self.tfidf_objs[col].get_params())
                try:
                    tfidf_vec.fit(XX)
                    new_data_avail = True
                except ValueError as e:
                    if 'vocab' in str(e):
                        # skip non-text-like column
                        # NOTE(review): this drops the column even though a
                        # pre-trained vectorizer exists in
                        # self.tfidf_objs[col] (the first fit() variant
                        # falls back to it instead) — confirm intended.
                        continue
                    new_data_avail = False
                if new_data_avail:
                    tfidf_vec.N_ = N_
                    pre_trained = self.tfidf_objs[col]
                    # Merge vocabulary/IDF statistics of old and new batches.
                    pre_trained = self.sync_vectorizers(pre_trained, tfidf_vec)
                else:
                    pre_trained = self.tfidf_objs[col]

                XX = pre_trained.transform(XX)
                self.tfidf_objs[col] = pre_trained

                # Align the cached (old) matrix with the freshly transformed
                # one and stack them so rows line up with the stacked target.
                XX = self.sync_tfidf(self.tf_idf_data[col], XX)
                self.tf_idf_data[col] = XX
                if new_X is None:
                    new_X = XX
                else:
                    new_X = sp.sparse.hstack([new_X, XX])

        models = [LogisticRegression(**self.return_lin_params())]
        if self.params["add_rf"]:
            # Optionally add a LightGBM classifier in random-forest mode.
            # NOTE(review): the binding from import_lightgbm() is
            # immediately shadowed by the `import lightgbm as lgbm` below —
            # presumably the dynamic loader selects the proper build first;
            # confirm against h2oaicore conventions.
            from h2oaicore.lightgbm_dynamic import import_lightgbm
            lgbm = import_lightgbm()
            import lightgbm as lgbm
            models.append(lgbm.LGBMClassifier(
                boosting_type='rf',
                colsample_bytree=.5,
                subsample=.632,  # Standard RF bagging fraction
                min_child_weight=2.5,
                min_child_samples=5,
                subsample_freq=1,
                min_split_gain=0,
                n_jobs=-1,
                **self.return_rf_params()
            ))

        # Fit every model on the combined sparse TF-IDF matrix.
        for mi, m in enumerate(models):
            try:
                m.fit(new_X, y)
            except ValueError as e:
                # general mutation as specified is not alllowed, see logistic_regression recipe.
                # Could use restricted choices there, but for simplicity just ignore the error entirely
                if mi == 0:
                    raise IgnoreEntirelyError(str(e))
                raise

        # All original columns get equal (dummy) importance.
        importances = [1] * len(orig_cols)
        self.set_model_properties(
            model={
                "model": models,
                "tf-idfs": self.tfidf_objs
            },
            features=orig_cols,
            importances=importances,
            iterations=0
        )
        if self.save_path:
            # Persist everything needed to resume training in a later run.
            joblib.dump({
                "tf_idf_obj": self.tfidf_objs,
                "tf_idf_data": self.tf_idf_data,
                "params": self.params,
                "target": y_,
            },
                self.save_path
            )
        # clear large objects to avoid large data in subprocess pipe
        self.tfidf_objs = None
        self.tf_idf_data = None