def transform_datetime(df: pd.DataFrame, config: Config):
    """Expand every ``datetime_*`` column into calendar-part columns, in place.

    Discovery (no ``"date_columns"`` entry in ``config``): each column whose
    name starts with ``datetime_`` is split into year / weekday / month /
    day / hour columns named ``<column>_<part>``.  A part whose values never
    differ from the first row is logged and dropped; the names of the parts
    that are kept are stored in ``config["date_columns"][<column>]``.

    Replay (``"date_columns"`` already in ``config``): only the recorded
    parts are rebuilt for the recorded columns.

    In both modes the source datetime column is removed from ``df``.

    Args:
        df: Frame that is modified in place.
        config: Mapping-like pipeline state; ``"date_columns"`` is read or
            written.
    """
    if "date_columns" in config:
        for source_col, kept_parts in config["date_columns"].items():
            for part in kept_parts:
                # NOTE: no unsigned-integer cast here, unlike the discovery
                # branch below; the dtype is whatever the accessor returns.
                df[source_col + "_" + part] = getattr(df[source_col].dt, part)
            df.drop(source_col, axis=1, inplace=True)
        return

    config["date_columns"] = {}

    # Snapshot the names first: df gains and loses columns inside the loop.
    datetime_columns = [name for name in df if name.startswith("datetime_")]

    for source_col in datetime_columns:
        config["date_columns"][source_col] = []

        for part in ("year", "weekday", "month", "day", "hour"):
            part_col = source_col + "_" + part
            part_dtype = np.uint16 if part == "year" else np.uint8
            df[part_col] = getattr(df[source_col].dt, part).astype(part_dtype).values

            # A part carries information only if some row differs from the first.
            varies = (df[part_col] != df[part_col].iloc[0]).any()
            if varies:
                config["date_columns"][source_col].append(part)
            else:
                log(part_col + " is constant")
                df.drop(part_col, axis=1, inplace=True)

        df.drop(source_col, axis=1, inplace=True)
def hyperopt_lightgbm(X: pd.DataFrame, y: pd.Series, params: Dict, config: Config):
    """Search LightGBM hyperparameters with hyperopt's TPE algorithm.

    The data is split with ``data_split(X, y, test_size=0.5)``; one part is
    used for training and the other for validation.  Each of the 50 trials
    trains for at most 300 boosting rounds with early stopping after 100
    rounds and is scored by the validation metric named in
    ``params["metric"]``.  The score is negated when
    ``config.is_classification()`` is true (presumably because that metric
    is higher-is-better -- confirm against the caller's ``params``).

    Args:
        X: Feature frame.
        y: Target values.
        params: Base LightGBM parameters; must contain ``"metric"``.
            Sampled hyperparameters override entries with the same key.
        config: Pipeline state; only ``is_classification()`` is used.

    Returns:
        The best hyperparameter dict, resolved through ``space_eval``.
    """
    X_fit, X_holdout, y_fit, y_holdout = data_split(X, y, test_size=0.5)
    train_data = lgb.Dataset(X_fit, label=y_fit)
    valid_data = lgb.Dataset(X_holdout, label=y_holdout)

    leaf_choices = np.linspace(10, 200, 50, dtype=int)
    bagging_freq_choices = np.linspace(0, 50, 10, dtype=int)

    space = {
        "learning_rate": hp.uniform("learning_rate", 0.01, 0.05),
        "max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6]),
        "num_leaves": hp.choice("num_leaves", leaf_choices),
        "feature_fraction": hp.quniform("feature_fraction", 0.5, 1.0, 0.1),
        "bagging_fraction": hp.quniform("bagging_fraction", 0.5, 1.0, 0.1),
        "bagging_freq": hp.choice("bagging_freq", bagging_freq_choices),
        "reg_alpha": hp.uniform("reg_alpha", 0, 30),
        "reg_lambda": hp.uniform("reg_lambda", 0, 30),
        "min_child_weight": hp.uniform('min_child_weight', 0.5, 10),
    }

    def objective(candidate):
        """Train one model for ``candidate`` and return its hyperopt loss."""
        booster = lgb.train(
            {**params, **candidate},
            train_data,
            num_boost_round=300,
            valid_sets=valid_data,
            early_stopping_rounds=100,
            verbose_eval=100,
        )
        metric_value = booster.best_score["valid_0"][params["metric"]]
        # hyperopt minimises the loss, so the sign is flipped for classification.
        loss = -metric_value if config.is_classification() else metric_value
        return {'loss': loss, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(
        fn=objective,
        space=space,
        trials=trials,
        algo=tpe.suggest,
        max_evals=50,
        verbose=1,
        rstate=np.random.RandomState(1),
    )

    # fmin reports hp.choice entries as indices; space_eval maps them back to values.
    hyperparams = space_eval(space, best)
    best_loss = trials.best_trial['result']['loss']
    log("{:0.4f} {}".format(best_loss, hyperparams))
    return hyperparams
def drop_constant_columns(df: pd.DataFrame, config: Config):
    """Remove ``number_*`` columns that never differ from their first row.

    When ``config`` has no ``"constant_columns"`` entry, the list of such
    columns is computed from ``df``, stored under that key and logged.
    Otherwise the stored list is reused.  The listed columns are then
    dropped from ``df`` in place.

    Args:
        df: Frame that is modified in place.
        config: Mapping-like pipeline state; ``"constant_columns"`` is read
            or written.
    """
    if "constant_columns" not in config:
        constant = []
        for name in df:
            if not name.startswith("number_"):
                continue
            column = df[name]
            # Any row differing from the first one means the column varies.
            if (column != column.iloc[0]).any():
                continue
            constant.append(name)

        config["constant_columns"] = constant
        log("Constant columns: " + ", ".join(config["constant_columns"]))

    to_drop = config["constant_columns"]
    if len(to_drop) > 0:
        df.drop(to_drop, axis=1, inplace=True)
def validate_dataset(alias: str, mode: str, train_limit: int) -> np.float64:
    """Train an ``AutoML`` model on the ``check_<alias>`` data and score it.

    The model directory, train/test CSVs and prediction file are all derived
    from ``alias``.  ``config["time_limit"]`` is set to ``train_limit`` for
    training and to 300 for prediction (units are not visible here --
    presumably seconds).

    Args:
        alias: Dataset name used to build every path.
        mode: Passed straight to ``AutoML.train``.
        train_limit: Value stored in ``config["time_limit"]`` before training.

    Returns:
        The second element of the tuple returned by ``AutoML.predict``.
    """
    log(alias)

    model_dir = "models/check_{}".format(alias)
    train_csv = "data/check_{}/train.csv".format(alias)
    test_csv = "data/check_{}/test.csv".format(alias)
    predictions_csv = "predictions/check_{}.csv".format(alias)

    automl = AutoML(model_dir)

    automl.config["time_limit"] = train_limit
    automl.train(train_csv, mode)

    automl.config["time_limit"] = 300
    _, score = automl.predict(test_csv, predictions_csv)
    return score
def subsample(df: pd.DataFrame, config: Config, max_size_mb: float = 2.0):
    """Shrink an oversized training frame in place and record its row count.

    Does nothing unless ``config.is_train()`` is true.  When the deep memory
    footprint of ``df`` exceeds ``max_size_mb``, a row budget is derived from
    the average memory per row, ``train_test_split`` (``random_state=1``)
    picks the rows to keep, and the remaining rows are dropped by index.
    ``config["nrows"]`` is set to the row budget, or to ``len(df)`` when no
    subsampling was needed.

    Args:
        df: Frame that is modified in place.
        config: Pipeline state; ``is_train()`` is queried and ``"nrows"``
            is written.
        max_size_mb: Size threshold in megabytes.
    """
    if not config.is_train():
        return

    df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
    if df_size_mb <= max_size_mb:
        config["nrows"] = len(df)
        return

    mem_per_row = df_size_mb / len(df)
    sample_rows = int(max_size_mb / mem_per_row)

    log("Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. Subsample to {} rows."
        .format(df_size_mb, len(df), sample_rows))

    # The "test" half of the split is the surplus; removing it by index
    # leaves the sampled rows in the caller's frame.
    surplus = train_test_split(df, train_size=sample_rows, random_state=1)[1]
    df.drop(surplus.index, inplace=True)

    config["nrows"] = sample_rows
def to_int8(df: pd.DataFrame, config: Config):
    """Downcast ``number_*`` columns that hold only -1, 0 or 1 to ``int8``.

    When ``config`` has no ``"int8_columns"`` entry, every ``number_*``
    column whose values all belong to ``{-1, 0, 1}`` is recorded under that
    key and the list is logged.  Otherwise the stored list is reused.  The
    listed columns are then converted to ``numpy.int8`` in place.

    Args:
        df: Frame that is modified in place.
        config: Mapping-like pipeline state; ``"int8_columns"`` is read or
            written.
    """
    if "int8_columns" not in config:
        allowed_values = [-1, 0, 1]
        config["int8_columns"] = [
            c for c in df
            if c.startswith("number_") and df[c].isin(allowed_values).all()
        ]
        log(config["int8_columns"])

    int8_columns = config["int8_columns"]
    if len(int8_columns) > 0:
        # Assign whole columns rather than through ``df.loc[:, cols] = ...``:
        # on pandas 2.x the ``.loc`` form writes into the existing arrays and
        # keeps their original dtype, which turns the downcast into a no-op.
        df[int8_columns] = df[int8_columns].astype(np.int8)
def transform_categorical(df: pd.DataFrame, config: Config):
    """Replace ``string_*`` columns with a smoothed target mean, in place.

    Fit (no ``"categorical_columns"`` entry in ``config``): the global mean
    of ``df["target"]`` is stored as ``config["categorical_prior"]``.  For
    every ``string_*`` column the per-category target mean is blended with
    that prior using a sigmoid weight of the category count, and the
    resulting ``{category: encoded value}`` dict is stored in
    ``config["categorical_columns"][<column>]``.

    Apply (always): each recorded column is replaced by its encoded values;
    categories missing from the stored dict receive the prior.

    Args:
        df: Frame that is modified in place; must contain ``"target"`` when
            fitting.
        config: Mapping-like pipeline state; ``"categorical_prior"`` and
            ``"categorical_columns"`` are read or written.
    """
    if "categorical_columns" not in config:
        prior = config["categorical_prior"] = df["target"].mean()
        min_samples_leaf = int(0.01 * len(df))
        smoothing = 0.5 * min_samples_leaf

        config["categorical_columns"] = {}
        for c in [c for c in df if c.startswith("string_")]:
            averages = df[[c, "target"]].groupby(c)["target"].agg(["mean", "count"])
            # Sigmoid weight in [0, 1]: categories with many rows trust their
            # own mean, rare categories fall back towards the global prior.
            smooth = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
            averages["target"] = prior * (1 - smooth) + averages["mean"] * smooth
            config["categorical_columns"][c] = averages["target"].to_dict()

        log(list(config["categorical_columns"].keys()))

    fallback = config["categorical_prior"]
    for c, values in config["categorical_columns"].items():
        # Assign the whole column rather than through ``df.loc[:, c] = ...``:
        # on pandas 2.x the ``.loc`` form writes into the existing object
        # array, so the encoded column would keep ``object`` dtype instead
        # of becoming numeric.
        df[c] = df[c].map(lambda x, values=values: values.get(x, fallback))
def feature_selection(df: pd.DataFrame, config: Config):
    """Select features on large training frames and drop the rest, in place.

    Training (``config.is_train()``): frames smaller than 2 GB (deep memory
    usage) are left untouched and the function returns immediately.
    Otherwise up to 10 samples of at most 1000 rows are drawn, each is run
    through ``preprocess_pipeline`` with a deep copy of ``config``, and
    ``select_features`` is called on the columns not selected so far; the
    loop stops early when no columns remain.  From the accumulated selection
    the function records:

    * ``config["drop_number_columns"]`` -- ``number_*`` / ``id_*`` columns
      of ``df`` that were not selected (only when non-empty);
    * ``config["date_columns"]`` -- selected date parts grouped by their
      source datetime column;
    * ``config["drop_datetime_columns"]`` -- ``datetime_*`` columns of
      ``df`` without any selected part (only when non-empty).

    Afterwards, in any mode, the columns listed under the two ``drop_*``
    keys (when present) are logged and removed from ``df``.

    Args:
        df: Frame that is modified in place.
        config: Pipeline state; see the keys above.
    """
    if config.is_train():
        size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if size_mb < 2 * 1024:
            # Small frame: skip selection and the drops below entirely.
            return

        chosen = []
        sample_config = copy.deepcopy(config)

        for seed in range(10):
            sample = df.sample(min(1000, len(df)), random_state=seed).copy(deep=True)
            preprocess_pipeline(sample, sample_config)

            target = sample["target"]
            features = sample.drop("target", axis=1)
            if len(chosen) > 0:
                # Only look at columns that earlier rounds did not pick.
                features = features.drop(chosen, axis=1)

            if len(features.columns) == 0:
                break
            chosen += select_features(features, target, config["mode"])

        log("Selected columns: {}".format(chosen))

        unselected_numeric = [
            name for name in df
            if name.startswith(("number_", "id_")) and name not in chosen
        ]
        if unselected_numeric:
            config["drop_number_columns"] = unselected_numeric

        config["date_columns"] = {}
        for name in chosen:
            if not name.startswith("datetime_"):
                continue
            # Selected names look like "<prefix>_<index>_<part>": the first
            # two pieces name the source column, the third the date part.
            pieces = name.split("_")
            date_col = pieces[0] + "_" + pieces[1]
            date_part = pieces[2]
            config["date_columns"].setdefault(date_col, []).append(date_part)

        unused_datetime = [
            name for name in df
            if name.startswith("datetime_") and name not in config["date_columns"]
        ]
        if unused_datetime:
            config["drop_datetime_columns"] = unused_datetime

    drop_steps = (
        ("drop_number_columns", "Drop number columns: {}"),
        ("drop_datetime_columns", "Drop datetime columns: {}"),
    )
    for key, message in drop_steps:
        if key in config:
            log(message.format(config[key]))
            df.drop(config[key], axis=1, inplace=True)
def preview_df(train_csv: str, config: Config, nrows: int = 3000):
    """Inspect the head of a training CSV and record how to load it.

    Counts the data rows in the file, reads the first ``nrows`` rows to
    estimate memory per row and total dataset size (both logged), and
    fills ``config`` with:

    * ``"parse_dates"`` -- names of the ``datetime_*`` columns;
    * ``"dtype"`` -- ``int`` for ``line_id`` and ``str`` for every
      ``string_*`` / ``datetime_*`` column;
    * ``"counters"`` -- number of ``id_*``, ``number_*``, ``string_*`` and
      ``datetime_*`` columns.

    Args:
        train_csv: Path of the CSV file to inspect.
        config: Mapping-like pipeline state that receives the keys above.
        nrows: Maximum number of rows read for the preview.
    """
    # Close the handle deterministically instead of leaking it; UTF-8
    # matches the encoding passed to ``read_csv`` below.
    with open(train_csv, encoding="utf-8") as csv_file:
        num_rows = sum(1 for _ in csv_file) - 1  # minus the header line
    log("Rows in train: {}".format(num_rows))

    df = pd.read_csv(train_csv, encoding="utf-8", low_memory=False, nrows=nrows)

    # Divide by the rows actually read: a file shorter than ``nrows`` would
    # otherwise yield an underestimated per-row size.  ``max`` guards the
    # empty-file case against a division by zero.
    rows_read = max(len(df), 1)
    mem_per_row = df.memory_usage(deep=True).sum() / rows_read
    log("Memory per row: {:0.2f} Kb".format(mem_per_row / 1024))

    df_size = (num_rows * mem_per_row) / 1024 / 1024
    log("Approximate dataset size: {:0.2f} Mb".format(df_size))

    config["parse_dates"] = []
    config["dtype"] = {
        "line_id": int,
    }

    counters = {
        "id": 0,
        "number": 0,
        "string": 0,
        "datetime": 0,
    }

    for c in df:
        if c.startswith("number_"):
            counters["number"] += 1
        elif c.startswith("string_"):
            counters["string"] += 1
            config["dtype"][c] = str
        elif c.startswith("datetime_"):
            counters["datetime"] += 1
            # Read as text first; the column is also registered for date parsing.
            config["dtype"][c] = str
            config["parse_dates"].append(c)
        elif c.startswith("id_"):
            counters["id"] += 1

    log("Number columns: {}".format(counters["number"]))
    log("String columns: {}".format(counters["string"]))
    log("Datetime columns: {}".format(counters["datetime"]))

    config["counters"] = counters
def validate(preds: pd.DataFrame, target_csv: str, mode: str) -> np.float64:
    """Score predictions against the targets stored in a CSV file.

    ``preds`` is joined with the CSV on ``line_id``.  The score is the ROC
    AUC when ``mode == "classification"`` and the root mean squared error
    otherwise; it is logged and returned.

    Args:
        preds: Frame with ``line_id`` and ``prediction``.
        target_csv: Path of a CSV with ``line_id`` and ``target``.
        mode: ``"classification"`` selects ROC AUC; any other value selects
            RMSE.

    Returns:
        The computed score.
    """
    targets = pd.read_csv(target_csv)

    # Join on the key column only.  Combining ``on=`` with
    # ``left_index=True`` raises MergeError on current pandas, and the
    # index of the merged frame is never used below.
    df = pd.merge(preds, targets, on="line_id")

    if mode == "classification":
        score = roc_auc_score(df.target.values, df.prediction.values)
    else:
        score = np.sqrt(mean_squared_error(df.target.values, df.prediction.values))

    log("Score: {:0.4f}".format(score))
    return score