import copy
import os
from typing import Optional, Tuple

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Config, read_df, preview_df, pandas_read_csv, preprocess, preprocess_pipeline,
# select_features, train, predict, validate, log and timeit are helpers defined
# elsewhere in this project.


class AutoML:
    def __init__(self, model_dir: str):
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config.tmp_dir = self.config.model_dir + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)
        train(X, y, self.config)

    def predict(self, test_csv: str, prediction_csv: str) -> Tuple[pd.DataFrame, Optional[np.float64]]:
        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        result = {
            "line_id": [],
            "prediction": [],
        }

        # Read the test set in chunks of the training-set size to bound memory use.
        for X in pd.read_csv(
                test_csv,
                encoding="utf-8",
                low_memory=False,
                dtype=self.config["dtype"],
                parse_dates=self.config["parse_dates"],
                chunksize=self.config["nrows"]
        ):
            result["line_id"] += list(X["line_id"])
            preprocess(X, self.config)
            result["prediction"] += list(predict(X, self.config))

        result = pd.DataFrame(result)
        result.to_csv(prediction_csv, index=False)

        # If the ground-truth file exists alongside the test set, score the run.
        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        self.config.save()

    @timeit
    def load(self):
        self.config.load()
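# A minimal usage sketch of the AutoML class (assumed entry point; the
# "models/example" directory, CSV paths and the "regression" mode string are
# hypothetical examples, not values fixed by the class itself):
#
#   automl = AutoML("models/example")
#   automl.train("data/train.csv", "regression")
#   automl.save()
#
#   automl = AutoML("models/example")
#   automl.load()
#   predictions, score = automl.predict("data/test.csv", "predictions/test.csv")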
def leak_detect(df: pd.DataFrame, config: Config) -> bool:
    if config.is_predict():
        return "leak" in config

    id_cols = [c for c in df if c.startswith("id_")]
    dt_cols = [c for c in df if c.startswith("datetime_")]

    if id_cols and dt_cols:
        num_cols = [c for c in df if c.startswith("number_")]
        for id_col in id_cols:
            # Inspect a single id group: if the target is almost perfectly
            # correlated with a lagged numeric column, the target leaks from
            # a future value of that column.
            group = df.groupby(by=id_col).get_group(df[id_col].iloc[0])
            for dt_col in dt_cols:
                sorted_group = group.sort_values(dt_col)
                for lag in range(-1, -10, -1):
                    for col in num_cols:
                        corr = sorted_group["target"].corr(sorted_group[col].shift(lag))
                        if corr >= 0.99:
                            config["leak"] = {
                                "num_col": col,
                                "lag": lag,
                                "id_col": id_col,
                                "dt_col": dt_col,
                            }
                            return True

    return False
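# Standalone illustration of the lag-correlation check above, using only
# pandas (the DataFrame below is synthetic; leak_detect itself relies on the
# project's Config and column-prefix conventions):
#
#   frame = pd.DataFrame({
#       "id_0": [1] * 6,
#       "datetime_0": pd.date_range("2018-01-01", periods=6),
#       "number_0": [10.0, 20.0, 30.0, 40.0, 50.0, 60.0],
#   })
#   # Target equals number_0 of the *next* row, i.e. a lag of -1 leaks it.
#   frame["target"] = frame["number_0"].shift(-1)
#
#   corr = frame["target"].corr(frame["number_0"].shift(-1))
#   assert corr >= 0.99  # leak_detect would record {"num_col": "number_0", "lag": -1, ...}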
def feature_selection(df: pd.DataFrame, config: Config):
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb < 2 * 1024:
            return

        selected_columns = []
        config_sample = copy.deepcopy(config)
        # Run feature selection on several small samples, accumulating the
        # columns selected on each pass and excluding them from the next one.
        for i in range(10):
            df_sample = df.sample(min(1000, len(df)), random_state=i).copy(deep=True)
            preprocess_pipeline(df_sample, config_sample)
            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            if len(selected_columns) > 0:
                X = X.drop(selected_columns, axis=1)

            if len(X.columns) > 0:
                selected_columns += select_features(X, y, config["mode"])
            else:
                break

        log("Selected columns: {}".format(selected_columns))

        drop_number_columns = [
            c for c in df
            if (c.startswith("number_") or c.startswith("id_")) and c not in selected_columns
        ]
        if len(drop_number_columns) > 0:
            config["drop_number_columns"] = drop_number_columns

        config["date_columns"] = {}
        for c in [c for c in selected_columns if c.startswith("datetime_")]:
            # Selected datetime features look like "datetime_0_month":
            # group the date parts by their source column.
            d = c.split("_")
            date_col = d[0] + "_" + d[1]
            date_part = d[2]
            if date_col not in config["date_columns"]:
                config["date_columns"][date_col] = []
            config["date_columns"][date_col].append(date_part)

        drop_datetime_columns = [
            c for c in df
            if c.startswith("datetime_") and c not in config["date_columns"]
        ]
        if len(drop_datetime_columns) > 0:
            config["drop_datetime_columns"] = drop_datetime_columns

    if "drop_number_columns" in config:
        log("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        log("Drop datetime columns: {}".format(config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    if "dtype" not in config:
        preview_df(csv_path, config)

    df = pandas_read_csv(csv_path, config)

    if config.is_train():
        config["nrows"] = len(df)

    return df
def subsample(df: pd.DataFrame, config: Config, max_size_mb: float = 2.0):
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb > max_size_mb:
            # Estimate how many rows fit into the size budget, then drop the rest.
            mem_per_row = df_size_mb / len(df)
            sample_rows = int(max_size_mb / mem_per_row)

            log("Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. Subsample to {} rows."
                .format(df_size_mb, len(df), sample_rows))

            _, df_drop = train_test_split(df, train_size=sample_rows, random_state=1)
            df.drop(df_drop.index, inplace=True)

            config["nrows"] = sample_rows
        else:
            config["nrows"] = len(df)
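# Quick check of the row-budget arithmetic with plain pandas (synthetic data;
# the real subsample() additionally records config["nrows"] and logs the decision):
#
#   df = pd.DataFrame(np.random.rand(100_000, 20))
#   size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
#   mem_per_row = size_mb / len(df)
#   rows_for_2mb = int(2.0 / mem_per_row)  # rows that fit the 2 Mb default budget
#   print(size_mb, rows_for_2mb)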
def non_negative_target_detect(df: pd.DataFrame, config: Config):
    if config.is_train():
        config["non_negative_target"] = df["target"].lt(0).sum() == 0