def fit_transform(self, X: dt.Frame, y: np.array = None):
    XX = X.to_pandas().iloc[:, 0].values
    is_na = np.isnan(XX)
    self._offset = -np.nanmin(XX) if np.nanmin(XX) < 0 else 0
    self._offset += 1e-3
    self._lmbda = None
    if not any(~is_na):
        return X
    self._lmbda = yeojohnson(self._offset + XX[~is_na], lmbda=self._lmbda)[1]  # compute lambda
    return self.transform(X)

def transform(self, X: dt.Frame):
    XX = X.to_pandas().iloc[:, 0].values
    is_na = np.isnan(XX) | np.array(XX <= -self._offset)
    if not any(~is_na) or self._lmbda is None:
        return X
    ret = yeojohnson(self._offset + XX[~is_na], lmbda=self._lmbda)  # apply transform with pre-computed lambda
    XX[~is_na] = ret
    return XX

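
# A minimal sketch (not part of the recipe above) of the scipy.stats.yeojohnson
# calls the fit/transform pair relies on: with lmbda=None it estimates lambda by
# maximum likelihood and returns (transformed, lmbda); with a fixed lmbda it
# returns only the transformed array.
import numpy as np
from scipy.stats import yeojohnson

x = np.array([0.5, 1.0, 2.0, 4.0, 8.0])
_, lmbda = yeojohnson(x)            # fit: estimate lambda
x_new = yeojohnson(x, lmbda=lmbda)  # transform: reuse the fitted lambda
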
def transform(self, X: dt.Frame):
    orig_col_name = X.names[0]
    X = dt.Frame(X).to_pandas().astype(str).fillna("NA")
    new_X = X.apply(lambda x: self.get_ne_count(x[orig_col_name]),
                    axis=1,
                    result_type='expand')
    new_X.columns = [
        f'{orig_col_name}_Count_{ne_type}' for ne_type in self.ne_types
    ]
    return new_X

def transform(self, X: dt.Frame):
    import pandas as pd
    ret_df = pd.DataFrame(
        [self.get_imports_features(x) for x in X.to_pandas().values[:, 0]])
    self._output_feature_names = ret_df.columns.to_list()
    self._feature_desc = self._output_feature_names
    return ret_df

def transform(self, X: dt.Frame, y: np.array = None):
    if ngpus_vis == 0:
        raise IgnoreEntirelyError("Transformer cannot run without GPUs")
    import cudf
    import cuml
    cuml.common.memory_utils.set_global_output_type('numpy')
    X = X.to_pandas().fillna(0)
    X = cudf.DataFrame(X)
    return self.model.predict(X)

def fit(self, X: dt.Frame, y: np.array = None): """ Fits ARIMA models (1 per time group) using historical target values contained in y :param X: Datatable frame containing the features :param y: numpy array containing the historical values of the target :return: self """ # Import the ARIMA python module pm = importlib.import_module('pmdarima') # Create dictionary that will link models to groups self.models = {} # Convert to pandas X = X.to_pandas() # Keep the Time Group Columns XX = X[self.tgc].copy() # Add the target XX['y'] = np.array(y) self.mean_value = np.mean(y) self.ntrain = X.shape[0] # Get the logger if it exists logger = self._get_logger() # Group the input by TGC (Time group column) excluding the time column itself # What we want is being able to access the time series related to each group # So that we can predict future sales for each store/department independently tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] # Build 1 ARIMA model per time group columns nb_groups = len(XX_grp) for _i_g, (key, X) in enumerate(XX_grp): # Just say where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo(logger, "Auto ARIMA : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups)) key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) # print("auto arima - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash)) order = np.argsort(X[self.time_column]) try: model = pm.auto_arima(X['y'].values[order], error_action='ignore') except Exception as e: loggerinfo(logger, "Auto ARIMA warning: {}".format(e)) model = None self.models[grp_hash] = model return self
def transform(self, X: dt.Frame): """ Uses fitted models (1 per time group) to predict the target If self.is_train exists, it means we are doing in-sample predictions if it does not then we Arima is used to predict the future :param X: Datatable Frame containing the features :return: ARIMA predictions """ X = X.to_pandas() XX = X[self.tgc].copy() tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) nb_groups = len(XX_grp) preds = [] for _i_g, (key, X) in enumerate(XX_grp): # Just say where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "Auto ARIMA : %d%% of groups transformed" % (100 * (_i_g + 1) // nb_groups)) key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) # print("auto arima - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash)) order = np.argsort(X[self.time_column]) if grp_hash in self.models: model = self.models[grp_hash] if model is not None: yhat = model.predict_in_sample() \ if hasattr(self, 'is_train') else model.predict(n_periods=X.shape[0]) yhat = yhat[order] XX = pd.DataFrame(yhat, columns=['yhat']) else: XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat']) # invalid model else: XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat']) # unseen groups XX.index = X.index preds.append(XX) XX = pd.concat(tuple(preds), axis=0).sort_index() return XX
def write_pset_table(pset_df, df_name, pset_name, df_dir):
    """
    Write a PSet table to a CSV file.

    @param pset_df: [`DataFrame`] A PSet DataFrame
    @param df_name: [`string`] The name of the PSet table
    @param pset_name: [`string`] The name of the PSet
    @param df_dir: [`string`] The name of the directory to hold all the PSet tables
    @return: [`None`]
    """
    pset_path = os.path.join(df_dir, pset_name)
    # Make sure directory for this PSet exists
    if not os.path.exists(pset_path):
        os.mkdir(pset_path)
    # Convert to datatable Frame for fast write to disk
    pset_df = Frame(pset_df)
    print(f'Writing {df_name} table to {pset_path}...')
    # Use datatable to write the Frame to CSV
    pset_df.to_csv(os.path.join(pset_path, f'{pset_name}_{df_name}.csv'))

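
# Hypothetical usage of write_pset_table (the table and names here are
# illustrative only, and df_dir is assumed to already exist, since the function
# only creates the per-PSet subdirectory):
import pandas as pd

df = pd.DataFrame({'drug': ['A', 'B'], 'auc': [0.8, 0.6]})
write_pset_table(df, 'dose_response', 'GDSC', './out')
# -> ./out/GDSC/GDSC_dose_response.csv
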
def create_data(X: dt.Frame = None):
    if X is None:
        return []
    fixup = process_tweets()
    X = dt.Frame(X).to_pandas()
    for text_colname in text_colnames:
        X["preprocessed_" + text_colname] = X[text_colname].astype(str).apply(lambda x: fixup.preprocess(x))
    temp_path = os.path.join(config.data_directory, config.contrib_relative_directory)
    os.makedirs(temp_path, exist_ok=True)
    # Save files to disk
    file_train = os.path.join(temp_path, output_dataset_name + ".csv")
    X.to_csv(file_train, index=False)
    return [file_train]

def transform(self, X: dt.Frame):
    col_names = X.names
    # print(col_names)  # uncomment for debug
    lat = []
    long = []
    for col in col_names:
        if col.find("latitude") > -1:
            lat.append(col)
        if col.find("longitude") > -1:
            long.append(col)
    if len(lat) == 2 and len(long) == 2:
        return X.to_pandas().apply(
            lambda row: distance(row[lat[0]], row[long[0]], row[lat[1]], row[long[1]]),
            axis=1)
    else:
        return X.to_pandas().iloc[:, 0]

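
# A plausible implementation of the `distance` helper used above (an
# assumption, not the recipe's actual code): great-circle (haversine) distance
# in kilometers between two latitude/longitude pairs given in degrees.
import math

def distance(lat1, lon1, lat2, lon2, radius_km=6371.0):
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    a = (math.sin((lat2 - lat1) / 2) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 2 * radius_km * math.asin(math.sqrt(a))
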
def fit_transform(self, X: dt.Frame, y: np.array = None): X = X.to_pandas().astype(str).iloc[:, 0].fillna("NA") # Count Vectorizer self.cnt_vec = CountVectorizer(analyzer="char", ngram_range=(1,self.max_ngram)) X = self.cnt_vec.fit_transform(X) # Truncated SVD if len(self.cnt_vec.vocabulary_) <= self.n_svd_comp: self.n_svd_comp = len(self.cnt_vec.vocabulary_) - 1 self.truncated_svd = TruncatedSVD(n_components=self.n_svd_comp, random_state=2019) X = self.truncated_svd.fit_transform(X) return X
def transform(self, X: dt.Frame):
    # Keep date only
    X = X[:, self.time_column].to_pandas()
    # Transform to pandas datetime
    X[self.time_column] = pd.to_datetime(X[self.time_column])
    # Create year and day of year so that we can merge with stored holidays
    X['year'] = X[self.time_column].dt.year
    X['doy'] = X[self.time_column].dt.dayofyear

    # Country-wide holidays first
    holi_df = self.memos['country']
    holi_df['is_DE_holiday_country'] = 1
    X["is_DE_holiday_country"] = X.merge(
        self.memos['country'], on=['year', 'doy'], how='left'
    ).fillna(0)['is_DE_holiday_country']

    # Then the Länder
    for prov in ['BW', 'BY', 'BE', 'BB', 'HB', 'HH', 'HE', 'MV',
                 'NI', 'NW', 'RP', 'SL', 'SN', 'ST', 'SH', 'TH']:
        holi_df = self.memos[prov]
        holi_df[f'is_DE_holiday_{prov}'] = 1
        X[f'is_DE_holiday_{prov}'] = X.merge(
            holi_df, on=['year', 'doy'], how='left'
        ).fillna(0)[f'is_DE_holiday_{prov}']

    X.drop([self.time_column, 'year', 'doy'], axis=1, inplace=True)

    features = [
        f'is_DE_holiday{orig_feat_prefix}{self.time_column}{extra_prefix}{prov}'
        for prov in ['country', 'BW', 'BY', 'BE', 'BB', 'HB', 'HH', 'HE', 'MV',
                     'NI', 'NW', 'RP', 'SL', 'SN', 'ST', 'SH', 'TH']
    ]
    self._output_feature_names = list(features)
    self._feature_desc = list(features)
    return X

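
# A minimal sketch (toy data, not the recipe's holiday tables) of the
# left-merge + fillna(0) pattern used above to turn a lookup table into a
# 0/1 indicator column:
import pandas as pd

X = pd.DataFrame({'year': [2020, 2020], 'doy': [1, 2]})
holi = pd.DataFrame({'year': [2020], 'doy': [1], 'is_holiday': [1]})
X['is_holiday'] = X.merge(holi, on=['year', 'doy'], how='left').fillna(0)['is_holiday']
# -> [1.0, 0.0]: day 1 is a holiday, day 2 is not
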
def fit(self, X: dt.Frame, y: np.array = None): """ Fits ARIMA models (1 per time group) using historical target values contained in y :param X: Datatable frame containing the features :param y: numpy array containing the historical values of the target :return: self """ # Import the ARIMA python module pm = importlib.import_module('pmdarima') # Init models self.models = {} # Convert to pandas X = X.to_pandas() XX = X[self.tgc].copy() XX['y'] = np.array(y) self.nan_value = np.mean(y) self.ntrain = X.shape[0] # Group the input by TGC (Time group column) excluding the time column itself tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) # Build 1 ARIMA model per time group columns nb_groups = len(XX_grp) for _i_g, (key, X) in enumerate(XX_grp): # Just say where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "Auto ARIMA : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups)) key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) # print("auto arima - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash)) order = np.argsort(X[self.time_column]) try: model = pm.auto_arima(X['y'].values[order], error_action='ignore') except: model = None self.models[grp_hash] = model return self
def transform(self, X: dt.Frame):
    h2o.init()
    model_path = os.path.join(temporary_files_path, self.id)
    with open(model_path, "wb") as f:
        f.write(self.raw_model_bytes)
    model = h2o.load_model(model_path)
    os.remove(model_path)
    frame = h2o.H2OFrame(X.to_pandas())
    try:
        return model.anomaly(frame).as_data_frame(header=False)
    finally:
        h2o.remove(self.id)

def transform(self, X: dt.Frame):
    X = dt.Frame(X)
    orig_col_name = X.names[0]
    new_X = X.to_pandas().astype(str).fillna("NA").iloc[:, 0].values
    new_X = [doc.split() for doc in new_X]
    new_X = [self.dictionary.doc2bow(doc) for doc in new_X]
    new_X = self.model.inference(new_X)[0]
    self._output_feature_names = [
        f'{self.display_name}{orig_feat_prefix}{orig_col_name}{extra_prefix}topic{i}'
        for i in range(new_X.shape[1])
    ]
    self._feature_desc = [
        f'LDA Topic {i} of {self.n_topics} for {orig_col_name} column'
        for i in range(new_X.shape[1])
    ]
    return new_X

def transform(self, X: dt.Frame):
    X = X.to_pandas()
    XX = X[self.tgc].copy()
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]
    assert len(XX_grp) > 0

    num_tasks = len(XX_grp)

    def processor(out, res):
        out.append(res)

    pool_to_use = small_job_pool
    pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)

    XX_paths = []
    model_paths = []
    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            print("Auto ARIMA -", 100 * (_i_g + 1) // nb_groups, "% of groups transformed")
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        X_path = os.path.join(temporary_files_path, "autoarima_Xt" + str(uuid.uuid4()))
        # Commented for performance, uncomment for debug
        # print("auto arima - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
        if grp_hash in self.models:
            model = self.models[grp_hash]
            model_path = os.path.join(temporary_files_path, "autoarima_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X, X_path)
            model_paths.append(model_path)

            args = (model_path, X_path, self.nan_value, hasattr(self, 'is_train'), self.time_column,)
            kwargs = {}
            pool.submit_tryget(None, MyParallelAutoArimaTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)
        else:
            # Don't go through pools for unseen groups
            XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat'])
            # Sync indices
            XX.index = X.index
            save_obj(XX, X_path)
            XX_paths.append(X_path)
    pool.finish()
    XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
    for p in XX_paths + model_paths:
        remove(p)
    return XX

def transform(self, X: dt.Frame):
    XX = X.to_pandas().iloc[:, 0].values
    is_na = np.isnan(XX) | np.array(XX <= -self._offset)
    if not any(~is_na) or self._lmbda is None:
        return X
    x = self._offset + XX[~is_na]
    x = np.asarray(x)
    x[x <= 0] = 1e-3  # don't worry if not invertible, just ensure we can transform and valid transforms are kept valid
    ret = boxcox(x, lmbda=self._lmbda)  # apply transform with pre-computed lambda
    XX[~is_na] = ret
    return XX

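
# A minimal sketch (not part of the recipe above) of the scipy.stats.boxcox
# calls it relies on. Box-Cox requires strictly positive input, which is why
# the transform shifts by an offset and clamps non-positive values first.
import numpy as np
from scipy.stats import boxcox

x = np.array([0.1, 1.0, 3.0, 9.0])
_, lmbda = boxcox(x)            # fit: estimate lambda on positive data
x_new = boxcox(x, lmbda=lmbda)  # transform with the pre-computed lambda
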
def transform(self, X: dt.Frame):
    import pandas as pd
    orig_col_name = X.names[0]
    ret_df = pd.DataFrame(
        [self.get_norm_byte_count(x) for x in X.to_pandas().values[:, 0]])
    self._output_feature_names = ['ByteNormCount_{}'.format(x) for x in range(ret_df.shape[1])]
    self._feature_desc = [f'Normalized Count of Byte value {x} for {orig_col_name} column'
                          for x in range(ret_df.shape[1])]
    return ret_df

def fit(self, X: dt.Frame, y: np.array = None):
    pm = importlib.import_module('pmdarima')
    self.models = {}
    X = X.to_pandas()
    XX = X[self.tgc].copy()
    XX['y'] = np.array(y)
    self.nan_value = np.mean(y)
    self.ntrain = X.shape[0]

    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]

    num_tasks = len(XX_grp)

    def processor(out, res):
        out[res[0]] = res[1]

    pool_to_use = small_job_pool
    pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)

    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            print("Auto ARIMA -", 100 * (_i_g + 1) // nb_groups, "% of groups fitted")
        X_path = os.path.join(temporary_files_path, "autoarima_X" + str(uuid.uuid4()))
        X = X.reset_index(drop=True)
        save_obj(X, X_path)
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))

        args = (X_path, grp_hash, self.time_column,)
        kwargs = {}
        pool.submit_tryget(None, MyParallelAutoArimaTransformer_fit_async,
                           args=args, kwargs=kwargs, out=self.models)
    pool.finish()
    for k, v in self.models.items():
        self.models[k] = load_obj(v) if v is not None else None
        remove(v)
    return self

def create_data(X: dt.Frame = None) -> pd.DataFrame:
    if X is None:
        return []
    from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
    X = X.to_pandas()
    y = X[TARGET_COLUMN].values
    X.drop(TARGET_COLUMN, axis=1, inplace=True)
    efs = EFS(ESTIMATOR,
              min_features=MIN_FEATURES,
              max_features=MAX_FEATURES,
              scoring=SCORING,
              cv=CV,
              n_jobs=-1)
    efs.fit(X, y)
    X_fs = X.iloc[:, list(efs.best_idx_)]
    return X_fs

def transform(self, X: dt.Frame): """Transform features once grouped by Time Group Columns (TGC)""" # With the col_type set to "all", X can contain text features # So restrict to int float and bool types # This is easily done in datatable X = X[:, [int, float, bool]] # If after the filtering there are no features left then just return a zero valued features if X.ncols == 0: return np.zeros((X.nrows, 1)) # Move to pandas to use the apply method X = X.to_pandas() group_cols = [_f for _f in self.tgc if _f != self.time_column] # Check if we really have any group columns available if len(group_cols) == 0: # Apply MACD directly on the available features but drop the time column features = [_f for _f in X.columns if _f != self.time_column] return self.normalized_macd(X[features]) # Get the data columns, i.e. not the group columns or time column col = np.setdiff1d(X.columns, self.tgc) if len(col) > 0: # Groupby by the TGC and apply normalized MACD to the data # Pandas.apply ios not time effective so should move this to data table try: res = X.groupby(group_cols)[col].apply(self.normalized_macd) except KeyError: return np.zeros((X.nrows, 1)) res.index = X.index if res.shape[1] == 0: return np.zeros((X.nrows, 1)) else: return res else: return np.zeros((X.nrows, 1))
def create_data(X: dt.Frame = None) -> pd.DataFrame:
    if X is None:
        return []
    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    X = X.to_pandas()
    y = X[TARGET_COLUMN].values
    X.drop(TARGET_COLUMN, axis=1, inplace=True)
    sfs = SFS(ESTIMATOR,
              k_features=K_FEATURES,
              forward=False,
              floating=False,
              scoring=SCORING,
              cv=CV,
              n_jobs=-1)
    sfs.fit(X, y)
    X_fs = X.iloc[:, list(sfs.k_feature_idx_)]
    return X_fs

def transform(self, X: dt.Frame):
    XX = X.to_pandas().iloc[:, 0].values
    is_na = np.isnan(XX) | np.array(XX <= -self._offset)
    if not any(~is_na) or self._lmbda is None:
        return X
    ret = yeojohnson(self._offset + XX[~is_na], lmbda=self._lmbda)  # apply transform with pre-computed lambda
    XX[~is_na] = ret
    XX = dt.Frame(XX)
    # Don't leave inf/-inf (replace operates on the whole frame at once)
    XX.replace([math.inf, -math.inf], None)
    return XX

def fit_transform(self, X: dt.Frame, y: np.array = None):
    if ngpus_vis == 0:
        raise IgnoreEntirelyError("Transformer cannot run without GPUs")
    import cudf
    import cuml
    cuml.common.memory_utils.set_global_output_type('numpy')
    self.n_clusters = min(self.n_clusters, X.nrows)
    self.model = cuml.cluster.KMeans(n_clusters=self.n_clusters,
                                     max_iter=self.max_iters,
                                     tol=self.tol)
    X = X.to_pandas().fillna(0)
    X = cudf.DataFrame(X)
    return self.model.fit_predict(X)

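
# A minimal sketch (not part of the recipe above, and it requires a GPU) of the
# cuML KMeans calls it relies on. set_global_output_type('numpy') makes
# predictions come back as NumPy arrays rather than cuDF/CuPy objects.
import cudf
import cuml

cuml.common.memory_utils.set_global_output_type('numpy')
df = cudf.DataFrame({'a': [0.0, 0.1, 5.0, 5.1], 'b': [0.0, 0.2, 5.0, 5.2]})
km = cuml.cluster.KMeans(n_clusters=2)
labels = km.fit_predict(df)  # NumPy array of cluster ids
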
def transform(self, X: dt.Frame):
    import pandas as pd
    mels = X.to_pandas().iloc[:, 0].apply(lambda x: self.get_mfcc(x))
    col_names = ['X_' + str(i) for i in range(len(mels[0]))]
    rows = len(mels)
    cols = len(mels[0])
    output_df = pd.DataFrame(data=np.reshape(np.concatenate(mels), (rows, cols)),
                             columns=col_names)
    return output_df

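
# A plausible get_mfcc helper for the transform above (an assumption: the
# recipe's actual helper is not shown here). Assuming librosa as the backend,
# it decodes an audio file and averages each MFCC coefficient over time to get
# a fixed-length vector per file.
import numpy as np
import librosa

def get_mfcc(path, n_mfcc=40):
    y, sr = librosa.load(path)                              # decode audio
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)  # (n_mfcc, n_frames)
    return np.mean(mfcc, axis=1)                            # average over frames
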
def transform(self, X: dt.Frame):
    output = []
    X = X.to_pandas()
    text1_arr = X.iloc[:, 0].values
    text2_arr = X.iloc[:, 1].values
    for ind, text1 in enumerate(text1_arr):
        try:
            text1 = set(str(text1).lower().split())
            text2 = text2_arr[ind]
            text2 = set(str(text2).lower().split())
            # Jaccard similarity of the two token sets
            output.append(len(text1.intersection(text2)) / len(text1.union(text2)))
        except Exception:
            output.append(-1)  # signal failure (e.g. empty union)
    return np.array(output)

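
# A worked toy example of the token-set Jaccard similarity computed above,
# |A ∩ B| / |A ∪ B|:
a = set("the quick brown fox".lower().split())
b = set("the lazy brown dog".lower().split())
jaccard = len(a & b) / len(a | b)  # 2 shared tokens / 6 distinct -> 0.333...
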
def _do_ale_per_feature(self, X: dt.Frame, features: list) -> dict:
    ale_per_feature = dict()
    for feature in features:
        try:
            ale_per_feature[feature] = self._do_ale(
                X=X.to_pandas(),
                feature=feature,
                bins=self.cfg_feature_bins.get(feature, self.cfg_bins),
            )
        except Exception as ex:
            self.logger.warning(f"ALE: skipping feature {feature}")
            self.logger.debug(
                f"ALE: skipping feature {feature} as it failed with: {ex}")
    return ale_per_feature

def fit_transform(self, X: dt.Frame, y: np.array = None):
    import gensim
    from gensim import corpora
    X = dt.Frame(X)
    new_X = X.to_pandas().astype(str).fillna("NA").iloc[:, 0].values
    new_X = [doc.split() for doc in new_X]
    self.dictionary = corpora.Dictionary(new_X)
    new_X = [self.dictionary.doc2bow(doc) for doc in new_X]

    self.model = gensim.models.ldamodel.LdaModel(new_X,
                                                 num_topics=self.n_topics,
                                                 id2word=self.dictionary,
                                                 passes=10,
                                                 random_state=2019)
    return self.transform(X)

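
# A minimal sketch (toy corpus, not part of the recipe above) of the gensim
# pipeline it relies on: Dictionary maps tokens to ids, doc2bow produces
# (id, count) pairs, and inference() returns the per-document topic weights
# that become the output features.
import gensim
from gensim import corpora

docs = [["cat", "dog", "cat"], ["dog", "fish"], ["cat", "fish", "fish"]]
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary,
                                      passes=10, random_state=2019)
topic_weights = lda.inference(corpus)[0]  # shape: (n_docs, n_topics)
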
def fit(self, X: dt.Frame, y: np.array = None):
    XX = X[:, self.tgc].to_pandas()
    XX = XX.replace([None, np.nan], 0)
    XX.rename(columns={self.time_column: "ds"}, inplace=True)
    if self.labels is not None:
        y = LabelEncoder().fit(self.labels).transform(y)
    XX['y'] = np.array(y)
    self.nan_value = np.mean(y)  # TODO - store mean per group, not just global

    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]

    self.models = {}
    num_tasks = len(XX_grp)

    def processor(out, res):
        out[res[0]] = res[1]

    pool_to_use = small_job_pool
    pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)

    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            print(100 * (_i_g + 1) // nb_groups, "% of groups fitted")
        X_path = os.path.join(temporary_files_path, "fbprophet_X" + str(uuid.uuid4()))
        X = X.reset_index(drop=True)
        save_obj(X, X_path)
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))

        args = (X_path, grp_hash,)
        kwargs = {}
        pool.submit_tryget(None, MyParallelProphetTransformer_fit_async,
                           args=args, kwargs=kwargs, out=self.models)
    pool.finish()
    for k, v in self.models.items():
        self.models[k] = load_obj(v) if v is not None else None
        remove(v)
    return self

def create_data(X: dt.Frame = None) -> Union[
    str, List[str],
    dt.Frame, List[dt.Frame],
    np.ndarray, List[np.ndarray],
    pd.DataFrame, List[pd.DataFrame],
    Dict[str, str],           # {data set names: paths}
    Dict[str, dt.Frame],      # {data set names: dt frames}
    Dict[str, np.ndarray],    # {data set names: np arrays}
    Dict[str, pd.DataFrame],  # {data set names: pd frames}
]:
    col_count = 2
    col_names = ["random_col_1", "random_col_2"]
    if col_count != len(col_names):
        raise ValueError("Number of column names must be equal to number of columns.")
    if X is None:
        return []
    rcol = dt.Frame(np.random.randint(0, 100, size=(X.shape[0], col_count)))
    rcol.names = col_names
    X.cbind(rcol)
    return X