def test_load_feature():
    """Round-trip a single-column frame through the feature store.

    A frame with one column ``'a'`` holding ``0..99`` is saved under
    feature id ``0`` in a temporary directory and loaded back; the loaded
    frame must compare equal to the one that was saved.
    """
    expected = pd.DataFrame()
    expected['a'] = np.arange(100)

    with get_temp_directory() as workdir:
        # Save and load use the same feature id and the same directory.
        fs.save_feature(expected, 0, workdir)
        restored = fs.load_feature(0, workdir)

        assert_frame_equal(expected, restored)
def test_load_feature_ignore_all_columns():
    """Loading with every stored column ignored yields a column-less frame.

    The frame saved under feature id ``0`` has columns ``'a'``, ``'b'`` and
    ``'c'``.  It is loaded with ``ignore_columns`` naming all three plus
    ``'X'``, which is not a column of the saved frame; the test expects the
    result to equal the saved frame with ``'a'``, ``'b'`` and ``'c'`` dropped.
    """
    stored_columns = ['a', 'b', 'c']

    df = pd.DataFrame()
    df['a'] = np.arange(100).astype(float)
    df['b'] = np.arange(100).astype(int)
    df['c'] = np.arange(100).astype(int)

    with get_temp_directory() as workdir:
        fs.save_feature(df, 0, workdir)

        # 'X' is deliberately not one of the stored columns.
        df_loaded = fs.load_feature(
            0,
            workdir,
            ignore_columns=['a', 'b', 'c', 'X'],
        )

        expected = df.drop(columns=stored_columns)
        assert_frame_equal(df_loaded, expected)
def test_various_dtypes():
    """Round-trip a frame whose columns cover several numeric dtypes.

    Each column holds ``0..99`` cast to a different dtype (float, int and
    fixed-width signed/unsigned NumPy integers).  The frame is saved under
    feature id ``0`` and loaded back; ``assert_frame_equal`` then requires
    the loaded frame to match the saved one.
    """
    # Column name -> dtype, in the order the columns are added to the frame.
    column_dtypes = [
        ('a', float),
        ('b', int),
        ('c', np.uint8),
        ('d', np.uint16),
        ('e', np.uint32),
        ('f', np.int8),
        ('g', np.int16),
        ('h', np.int32),
        ('i', np.int64),
    ]

    original = pd.DataFrame()
    for column, dtype in column_dtypes:
        original[column] = np.arange(100).astype(dtype)

    with get_temp_directory() as workdir:
        fs.save_feature(original, 0, workdir)
        restored = fs.load_feature(0, workdir)

        assert_frame_equal(original, restored)
import pandas as pd
from nyaggle.experiment import run_experiment
from nyaggle.feature_store import load_features, load_feature
from sklearn.metrics import average_precision_score

from src.utils import prauc, get_folds

# Name of the label column and the identifier columns that are excluded
# from the model inputs below.
target_col = "target"
drop_cols = ["spectrum_id", "spectrum_filename", "chip_id"]

submission = pd.read_csv("input/atmaCup5__sample_submission.csv")

# Base table, extended with the listed stored feature sets.
all_df = load_feature("all", "working")
data = load_features(
    all_df,
    feature_names=[
        "fitting",
        "peak_around",
        "intensity_stats",
        "savgol_peak",
        "spec_percentile",
        "fitting_combination",
    ],
    # Pass a separate list so ``drop_cols`` itself is never shared.
    ignore_columns=list(drop_cols),
)

# Rows with a target value form the training set; rows without one form
# the test set.
has_target = data[target_col].notna()
train = data[has_target].copy()
test = data[~has_target].copy()

non_feature_cols = drop_cols + [target_col]
X_train = train.drop(columns=non_feature_cols)
y_train = train[target_col]
X_test = test.drop(columns=non_feature_cols)
spec = spec.copy() spec["wave_index"] = spec.groupby("spectrum_filename").intensity.transform( lambda x: np.arange(len(x))) feat = pd.pivot(spec, index="spectrum_filename", columns="wave_index", values="intensity").ffill(axis=1) feat.columns = [f"intensity_{i:03d}" for i in range(512)] df = df.merge(feat, left_on="spectrum_filename", right_index=True) return df.iloc[:, -len(feat.columns):] if __name__ == "__main__": submission = pd.read_csv("input/atmaCup5__sample_submission.csv") train = pd.read_csv("input/train.csv") all_df = load_feature("all", "working") spec = load_feature("spec", "working") pad_spec = create_pad_spectrum(all_df, spec) # add derivative spectra # https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.savgol_filter.html spec_array = np.stack( [ pad_spec.values, scipy.signal.savgol_filter(pad_spec, 5, 2, deriv=0, axis=1), scipy.signal.savgol_filter(pad_spec, 5, 2, deriv=1, axis=1), scipy.signal.savgol_filter(pad_spec, 5, 2, deriv=2, axis=1), ], axis=1, ) # (14388, 4, 512)