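# The method below relies on the recipe's module-level imports; the standard-library
# and scikit-learn ones are reproduced here for completeness. The h2oaicore import
# paths for config, get_value and IgnoreEntirelyError are assumptions and may differ
# between Driverless AI versions, so they are left commented out.
import joblib
import numpy as np
import scipy as sp
import scipy.sparse  # makes sp.sparse available below
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# from h2oaicore.systemutils import config, get_value, IgnoreEntirelyError  # assumed location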
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    y_ = y.copy()
    orig_cols = list(X.names)
    text_names = X[:, [str]].names  # only string columns get TF-IDF features
    self.loaded = False
    self.load_path = get_value(config, self.load_key)
    self.save_path = get_value(config, self.save_key)
    # Warm start: restore vectorizers, cached TF-IDF matrices and targets from a previous fit.
    if self.load_path:
        data = joblib.load(self.load_path)
        self.tfidf_objs = data["tf_idf_obj"]
        self.tf_idf_data = data["tf_idf_data"]
        self.prev_params = data["params"]
        self.target = data["target"]
        self.loaded = True

    if not self.loaded:
        # First fit: build one TF-IDF vectorizer per text column and hstack the outputs.
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
        self.tfidf_objs = {}
        self.tf_idf_data = {}
        new_X = None
        for col in text_names:
            XX = X[:, col].to_pandas()
            XX = XX[col].astype(str).fillna("NA").values.tolist()
            tfidf_vec = TfidfVectorizer(**self.return_tfidf_params())
            try:
                tfidf_vec = self._fit_vectorizer(tfidf_vec, XX)
            except ValueError as e:
                if 'vocab' in str(e):
                    # skip non-text-like column
                    continue
                else:
                    raise
            XX = tfidf_vec.transform(XX)
            tfidf_vec.N_ = XX.shape[0]  # remember how many documents this vectorizer has seen
            self.tfidf_objs[col] = tfidf_vec
            self.tf_idf_data[col] = XX
            if new_X is None:
                new_X = XX
            else:
                new_X = sp.sparse.hstack([new_X, XX])
    else:
        # Continued fit: append the new batch to the stored targets, merge the new
        # vocabulary into the pre-trained vectorizers, and stack old and new rows.
        y_ = np.hstack([self.target, y_])
        y = y_.copy()
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
        new_X = None
        for col in text_names:
            XX = X[:, col].to_pandas()
            XX = XX[col].astype(str).fillna("NA").values.tolist()
            N_ = len(XX)
            tfidf_vec = TfidfVectorizer()
            tfidf_vec.set_params(**self.tfidf_objs[col].get_params())
            try:
                tfidf_vec.fit(XX)
                new_data_avail = True
            except ValueError as e:
                if 'vocab' in str(e):
                    # skip non-text-like column
                    continue
                new_data_avail = False
            if new_data_avail:
                tfidf_vec.N_ = N_
                pre_trained = self.tfidf_objs[col]
                pre_trained = self.sync_vectorizers(pre_trained, tfidf_vec)
            else:
                pre_trained = self.tfidf_objs[col]
            XX = pre_trained.transform(XX)
            self.tfidf_objs[col] = pre_trained
            XX = self.sync_tfidf(self.tf_idf_data[col], XX)
            self.tf_idf_data[col] = XX
            if new_X is None:
                new_X = XX
            else:
                new_X = sp.sparse.hstack([new_X, XX])

    models = [LogisticRegression(**self.return_lin_params())]
    if self.params["add_rf"]:
        # Dynamic import ensures the correct LightGBM build is available before importing.
        from h2oaicore.lightgbm_dynamic import import_lightgbm
        lgbm = import_lightgbm()
        import lightgbm as lgbm
        models.append(lgbm.LGBMClassifier(
            boosting_type='rf',
            colsample_bytree=.5,
            subsample=.632,  # standard RF bagging fraction
            min_child_weight=2.5,
            min_child_samples=5,
            subsample_freq=1,
            min_split_gain=0,
            n_jobs=-1,
            **self.return_rf_params()))
    for mi, m in enumerate(models):
        try:
            m.fit(new_X, y)
        except ValueError as e:
            # General mutation as specified is not allowed, see logistic_regression recipe.
            # Could use restricted choices there, but for simplicity just ignore the error entirely.
            if mi == 0:
                raise IgnoreEntirelyError(str(e))
            raise

    importances = [1] * len(orig_cols)
    self.set_model_properties(
        model={"model": models, "tf-idfs": self.tfidf_objs},
        features=orig_cols,
        importances=importances,
        iterations=0)
    if self.save_path:
        joblib.dump({
            "tf_idf_obj": self.tfidf_objs,
            "tf_idf_data": self.tf_idf_data,
            "params": self.params,
            "target": y_,
        }, self.save_path)
    # clear large objects to avoid large data in subprocess pipe
    self.tfidf_objs = None
    self.tf_idf_data = None
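# sync_vectorizers and sync_tfidf are recipe helpers not shown in this section. The
# sketch below is a minimal, hypothetical illustration of what they could do, assuming
# the default smooth_idf=True and the N_ document counter attached in fit(); the names,
# formulas and free-function form (the recipe calls them as methods) are assumptions,
# not the recipe's actual implementation.
import numpy as np
import scipy.sparse


def sync_vectorizers(old, new):
    """Merge a vectorizer fitted on a new batch into the pre-trained one."""
    def doc_freq(vec):
        # Invert idf = ln((1 + N) / (1 + df)) + 1 to recover document frequencies.
        return np.round((1.0 + vec.N_) / np.exp(vec.idf_ - 1.0) - 1.0)

    old_df, new_df = doc_freq(old), doc_freq(new)
    vocab = dict(old.vocabulary_)
    df = {t: old_df[i] for t, i in old.vocabulary_.items()}
    for t, i in new.vocabulary_.items():
        if t not in vocab:
            vocab[t] = len(vocab)  # append unseen terms at the end
        df[t] = df.get(t, 0.0) + new_df[i]
    N = old.N_ + new.N_
    idf = np.zeros(len(vocab))
    for t, i in vocab.items():
        idf[i] = np.log((1.0 + N) / (1.0 + df[t])) + 1.0
    old.vocabulary_ = vocab
    old.idf_ = idf  # recent scikit-learn versions expose an idf_ setter
    old.N_ = N
    return old


def sync_tfidf(old_mat, new_mat):
    """Stack the stored TF-IDF rows on top of the new batch's rows."""
    # Pad old rows with zero columns for terms first seen in the new batch; the row
    # order matches y_ = np.hstack([self.target, y_]) in fit(), keeping targets aligned.
    pad = new_mat.shape[1] - old_mat.shape[1]
    if pad > 0:
        old_mat = scipy.sparse.hstack(
            [old_mat, scipy.sparse.csr_matrix((old_mat.shape[0], pad))])
    return scipy.sparse.vstack([old_mat, new_mat]).tocsr()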