def transform(self, fp):
    """Rescale all features of the pool with a ``MinMaxScaler``.

    The scaler is configured with ``self.feature_range``, fitted on the
    array form of the whole pool and then applied to that same array.

    Args:
        fp: Features accepted by the ``FeaturePool`` constructor.

    Yields:
        The features produced by ``FeaturePool.from_array`` from the
        original pool metadata and the scaled array.
    """
    # Wrap the input once: building the pool twice is redundant work and
    # the second pool would be empty if `fp` were a one-shot iterator.
    pool = FeaturePool(fp)
    fm = pool.meta()
    x = pool.array()

    scaler = MinMaxScaler(feature_range=self.feature_range)
    scaler.fit(x)

    yield from FeaturePool.from_array(fm, scaler.transform(x))
def transform(self, fp):
    """Over-sample the training data with SMOTE and pass test features through.

    Args:
        fp: Features; must be iterable more than once, because it is read
            by ``FeaturePool.to_train_arrays`` and iterated again afterwards.

    Yields:
        First the features rebuilt from the over-sampled training arrays,
        each configured with ``is_over_sampled=True``; then every input
        feature whose ``split_type`` is ``SplitType.TEST``, unchanged.
    """
    fm, train_x, train_y = FeaturePool.to_train_arrays(fp)

    sampler = SMOTE(random_state=self.random_state)
    # Only the first target column is handed to the sampler.
    resampled_x, resampled_y = sampler.fit_sample(train_x, train_y[:, 0])

    # Put the resampled targets back into a single-column 2-D layout.
    n_rows = resampled_y.shape[0]
    resampled_y = resampled_y.reshape((n_rows, 1))

    rebuilt = FeaturePool.from_train_arrays(fm, resampled_x, resampled_y)
    for feature in rebuilt:
        yield Feature.apply_config(feature, is_over_sampled=True)

    for feature in fp:
        if feature.split_type != SplitType.TEST:
            continue
        yield feature
def fit_model(self, fp):
    """Fit the wrapped estimator on the categorical training predictors.

    Args:
        fp: Features accepted by the ``FeaturePool`` constructor.

    Returns:
        Whatever ``self._inst.fit`` returns for the predictor and target
        arrays of the training split.
    """

    def _is_categorical(feature):
        return feature.categorical

    pool = FeaturePool(fp)

    # Predictors: training split, restricted to categorical features.
    train_predictors = pool.train_split().predictors()
    categorical = train_predictors.filter(_is_categorical)
    x = categorical.array()

    # Targets: training split, all target features.
    train_targets = pool.train_split().targets()
    y = train_targets.array()

    return self._inst.fit(x, y)
def transform(self, fp):
    """Replace the pool with its UMAP embedding, one feature per dimension.

    Args:
        fp: Features accepted by the ``FeaturePool`` constructor.

    Yields:
        One ``Feature`` per column of the array returned by
        ``self._inst.fit_transform``, named ``"UMAP feature #<index>"``.
    """
    x = FeaturePool(fp).array()

    logger.info("TUmap: starting UMAP transform ...")
    x_emb = self._inst.fit_transform(x)
    # Fixed prefix: the original logged "TUamp", which did not match the
    # "TUmap" prefix of the start message above.
    logger.info("TUmap: Done")

    n_components = x_emb.shape[1]
    for f_id in range(n_components):
        yield Feature(
            "UMAP feature #{}".format(f_id),
            x_emb[:, f_id],
        )
def fit_model(self, fp):
    """Fit the wrapped selector on the training arrays and sanity-check it.

    Args:
        fp: Features accepted by ``FeaturePool.to_train_arrays``.

    Returns:
        The object returned by ``self._inst.fit``.

    Raises:
        AssertionError: If the fitted model's ``support_`` has a different
            length than the predictor metadata.
    """
    fm, train_x, train_y = FeaturePool.to_train_arrays(fp)
    predictor_meta = fm.predictors()

    # The targets are flattened before being passed to the estimator.
    fitted = self._inst.fit(train_x, train_y.ravel())

    # The support mask is expected to line up with the predictor metadata.
    assert len(predictor_meta) == len(fitted.support_), (
        "Size of output of RFE does not equals to the metadata {} != {}".format(
            len(predictor_meta), len(fitted.support_)
        )
    )
    return fitted
def transform(self, fp):
    """Drop the predictors that the fitted model does not support.

    Args:
        fp: Features; must be iterable more than once, because it is
            passed to ``fit_model``, wrapped in a ``FeaturePool`` and then
            iterated directly.

    Yields:
        Every non-predictor feature, and every predictor whose name is
        among the supported training predictors. Each dropped predictor
        is reported through ``logger.info``.
    """
    model = self.fit_model(fp)

    # Pair each training predictor with its flag in the support mask and
    # remember the names of the ones that are kept.
    train_predictors = FeaturePool(fp).train_predictors()
    kept_names = {
        feature.name
        for feature, is_supported in zip(train_predictors, model.support_)
        if is_supported
    }

    for feature in fp:
        if not feature.is_predictor or feature.name in kept_names:
            yield feature
            continue
        logger.info("TFeatureElimination: eliminating feature `{}`".format(feature.name))
def transform(self, fp):
    """Split the pool's rows into a training part and a test part.

    Args:
        fp: Features; must be iterable more than once, because it is
            wrapped in a ``FeaturePool`` and then enumerated once per split.

    Yields:
        For every input feature a copy built from the training rows and
        configured with ``SplitType.TRAIN``; after all of those, a copy of
        every feature built from the test rows with ``SplitType.TEST``.
    """
    train_a, test_a = train_test_split(
        FeaturePool(fp).array(),
        test_size=self.test_size,
        random_state=self.random_state,
    )

    # All training features are emitted before any test feature.
    splits = (
        (train_a, SplitType.TRAIN),
        (test_a, SplitType.TEST),
    )
    for matrix, split_type in splits:
        # Column `column` of the matrix is paired with the feature at the
        # same position in `fp`.
        for column, source in enumerate(fp):
            rebuilt = Feature(source.name, matrix[:, column], source.st)
            yield Feature.apply_config(rebuilt, split_type=split_type)
def plot_embedding(efp: FeaturePool, split_by=None):
    """Scatter-plot a two-column embedding on a 7x7 figure.

    Args:
        efp: Pool whose ``array()`` has exactly two columns.
        split_by: Optional feature; when given, its ``data`` is passed as
            the scatter colours and its ``name`` appears in the title.

    Raises:
        AssertionError: If the embedding does not have exactly two columns.
    """
    x = efp.array()
    assert x.shape[1] == 2, "Embedding is expected to be with the size 2 to plot, got {}".format(x.shape[1])

    fig = plt.figure(figsize=(7, 7))
    ax = fig.add_subplot(111)

    first, second = x[:, 0], x[:, 1]
    if split_by is None:
        ax.scatter(first, second, alpha=0.5)
        title = "UMAP for a feature pool"
    else:
        colours = split_by.data
        ax.scatter(first, second, c=colours, alpha=0.5)
        title = "UMAP for a feature pool splitted by feature `{}`".format(split_by.name)

    ax.set_title(title)
    fig.show()
def run(self, d):
    """Apply the queued operations to ``d`` in order, emptying the queue.

    Operations are taken from the left of ``self.ops`` one at a time and
    dispatched on their type:

    * ``Transform``: called with ``d.features``; the result is collected
      into a new ``FeaturePool``.
    * ``Model``: called with the pool itself.
    * ``Validation``: called with a ``Model.Output``.

    Args:
        d: Initial data handed to the first operation.

    Returns:
        The result of the last operation, or ``d`` itself when there are
        no queued operations.

    Raises:
        AssertionError: If the current data has the wrong type for the
            operation about to run.
        ValueError: If an operation is of none of the known kinds.
    """
    while self.ops:
        op = self.ops.popleft()

        if isinstance(op, Transform):
            assert isinstance(d, FeaturePool), "Expecting `FeaturePool`, got {}".format(d)
            d = FeaturePool(list(op(d.features)))
            continue

        if isinstance(op, Model):
            assert isinstance(d, FeaturePool), "Expecting `FeaturePool`, got {}".format(d)
            d = op(d)
            continue

        if isinstance(op, Validation):
            assert isinstance(d, Model.Output), "Expecting Model.Output for validation, got {}".format(d)
            d = op(d)
            continue

        raise ValueError("Failed to dispatch operation: `{}`".format(op))

    return d
def transform(self, fp):
    """Show the pool to ``self.callback`` and pass the features through.

    Args:
        fp: Features; wrapped in a ``FeaturePool`` for the callback and
            then iterated directly.

    Yields:
        Every item of ``fp``, unchanged and in order.
    """
    pool = FeaturePool(fp)
    self.callback(pool)
    yield from fp
elif data["tenure"] > 60: return "Tenure_gt_60" telcom["tenure_group"] = telcom.apply(lambda telcom: tenure_lab(telcom), axis = 1) # telcom = telcom.drop("tenure", axis=1) # # telcom = telcom.drop("TotalCharges", axis=1) # telcom = telcom.drop("customerID", axis=1) # telcom.TotalCharges = pd.to_numeric(telcom.TotalCharges, errors='coerce') # telcom['Churn'].replace(to_replace='Yes', value=1, inplace=True) # telcom['Churn'].replace(to_replace='No', value=0, inplace=True) # telcom = pd.get_dummies(telcom) # telcom.dropna(inplace = True) fp = FeaturePool.from_dataframe(telcom) seed = 5 clean = Pipeline( TParse(), TCleanPool(), TSummary(), ) te = Pipeline( TPreprocessPool(), TSummary(), TCleanRedundantFeatures(correlation_bound=0.99), )
return "Tenure_gt_60" telcom["tenure_group"] = telcom.apply(lambda telcom: tenure_lab(telcom), axis=1) # telcom = telcom.drop("tenure", axis=1) # # telcom = telcom.drop("TotalCharges", axis=1) # telcom = telcom.drop("customerID", axis=1) # telcom.TotalCharges = pd.to_numeric(telcom.TotalCharges, errors='coerce') # telcom['Churn'].replace(to_replace='Yes', value=1, inplace=True) # telcom['Churn'].replace(to_replace='No', value=0, inplace=True) # telcom = pd.get_dummies(telcom) # telcom.dropna(inplace = True) fp = FeaturePool.from_dataframe(telcom) seed = 5 clean = Pipeline( TParse(), TCleanPool(), TSummary(), ) te = Pipeline( TPreprocessPool(), TSummary(), TCleanRedundantFeatures(correlation_bound=0.99), )
def __init__(self, models, fp):
    """Store the models and wrap the features in a ``FeaturePool``.

    Args:
        models: Stored as-is on ``self.models``.
        fp: Features accepted by the ``FeaturePool`` constructor; the
            resulting pool is stored on ``self.fp``.
    """
    self.models = models
    pool = FeaturePool(fp)
    self.fp = pool