def power_transformer(ss: pd.Series):
    """
    PowerTransformer-only counterpart of from_sklearn().

    It exists as a separate function purely so that the transformation gets
    a descriptive name (that name is used later in some important plots).

    The input is first passed through process_ss(), which yields the data to
    transform together with a variable name and an index.

    Returns a 2-tuple:
      * the transformed first column as a pd.Series carrying that index
        and name,
      * the result of from_sklearn(transformer, t_name), where t_name is
        built from the transformer's `method` parameter and its first
        fitted lambda rounded to 2 decimals.
    """
    transformer = PowerTransformer()
    values, var_name, index = process_ss(ss)

    try:
        transformed = transformer.transform(values)[:, 0]
    except NotFittedError:
        # Transformer has no fitted parameters yet: report it, then fit and
        # transform in a single step.
        print(f"! fitting parameters for PowerTransformer on variable {var_name}")
        transformed = transformer.fit_transform(values)[:, 0]

    result = pd.Series(transformed, index=index, name=var_name)

    # Label pieces: method name without dashes + LaTeX-style lambda subscript.
    method_label = transformer.get_params()['method'].title().replace("-", "")
    lambda_label = "_{{\\lambda = {}}}".format(round(transformer.lambdas_[0], 2))
    t_name = method_label + lambda_label

    return result, from_sklearn(transformer, t_name)
def _power_transform_columns(X_train, X_test, columns):
    """Power-transform `columns` of both frames in place, fitting on X_train only.

    The transformer is fitted on the training frame and the very same fitted
    transformer is then applied to the test frame, so no information from the
    test split influences the fitted lambdas.

    Does nothing when `columns` is empty (PowerTransformer cannot be fitted
    on zero features).
    """
    if not columns:
        return
    pt = PowerTransformer()
    X_train[columns] = pt.fit_transform(X_train[columns])
    X_test[columns] = pt.transform(X_test[columns])


def power_transform():
    """Build power-transformed train/test CSVs for every country in COUNTRIES.

    For each country:
      1. read ``../data/cleaned/<country>.csv`` (relative to the current
         working directory);
      2. call process.add_polynomial_features(country, df, 10);
      3. select the numerical (non-object) feature columns other than
         'incidence' and keep those whose absolute skewness exceeds 0.5;
      4. call process.hot_encode_weeks(country, df);
      5. draw a reproducible 80/20 train/test split (random_state=200),
         sort both parts by 'date' and drop the 'week' and 'date' columns;
      6. power-transform the skewed features: fitted on the training split,
         then applied unchanged to the test split;
      7. standardise the numerical features through process.train_std_normal
         and process.apply_std_normal;
      8. write X_train.csv, y_train.csv, X_test.csv and y_test.csv to
         ``../data/test/<country>/`` (created if missing).

    Returns None; the results are the files written to disk.
    """
    path = Path.cwd()
    cleaned_data_path = path.parent / 'data' / 'cleaned'
    # The process.* helpers take the whole per-country dict plus a key.
    df = {}

    for country in COUNTRIES:
        # read file
        file_path = cleaned_data_path / (country + '.csv')
        df[country] = pd.read_csv(file_path)

        process.add_polynomial_features(country, df, 10)

        numerical_features = df[country].select_dtypes(
            exclude=["object"]).columns
        numerical_features = numerical_features.drop('incidence')

        # Features skewed enough (|skew| > 0.5) to be worth power-transforming.
        skewness = df[country][numerical_features].apply(skew)
        skewed_features = skewness[skewness.abs() > 0.5].index

        process.hot_encode_weeks(country, df)

        train = df[country].sample(frac=0.8, random_state=200)
        test = df[country].drop(train.index)

        train = train.sort_values(by="date").drop(columns=['week', 'date'])
        test = test.sort_values(by="date").drop(columns=['week', 'date'])

        y_train = train['incidence'].to_frame('incidence')
        y_test = test['incidence'].to_frame('incidence')

        X_train = train.drop(columns=['incidence'])
        X_test = test.drop(columns=['incidence'])

        # Only transform skewed columns that survived the column drops above.
        # Previously the transformer output was discarded and the transformer
        # was re-fitted on the test split, so the written CSVs were never
        # power-transformed.
        skewed_columns = [
            col for col in skewed_features if col in X_train.columns
        ]
        _power_transform_columns(X_train, X_test, skewed_columns)

        # The train_* call receives the empty dicts and the apply_* call
        # receives the same objects afterwards; presumably the first fills
        # them with per-feature statistics for the second - confirm in process.
        means = {}
        std_deviations = {}
        process.train_std_normal(X_train, numerical_features, means,
                                 std_deviations)
        process.apply_std_normal(X_test, numerical_features, means,
                                 std_deviations)

        test_data_path = path.parent / 'data' / 'test' / country
        # Make sure the per-country output directory exists before writing.
        test_data_path.mkdir(parents=True, exist_ok=True)

        X_train.to_csv(test_data_path / 'X_train.csv', index=False)
        y_train.to_csv(test_data_path / 'y_train.csv', index=False)
        X_test.to_csv(test_data_path / 'X_test.csv', index=False)
        y_test.to_csv(test_data_path / 'y_test.csv', index=False)