Пример #1
0
class AutoML:
    """Train/predict facade driven by a ``Config`` stored under ``model_dir``.

    The heavy lifting is delegated to module-level helpers (``read_df``,
    ``preprocess``, ``train``, ``predict``, ``validate``); this class only
    wires them together and keeps the shared ``Config`` instance.
    """

    def __init__(self, model_dir: str):
        """Create ``model_dir`` if it is missing and bind a ``Config`` to it."""
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        """Fit a model on ``train_csv``.

        Args:
            train_csv: Path to the training CSV; it must contain a
                ``"target"`` column.
            mode: Task mode, stored in ``config["mode"]`` and later passed
                to ``validate`` by :meth:`predict`.
        """
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)

        # Inside a method body the bare name ``train`` resolves to the
        # module-level function, not to this method.
        train(X, y, self.config)

    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Predict for ``test_csv`` and write the result to ``prediction_csv``.

        The test file is read in chunks of ``config["nrows"]`` rows using the
        ``dtype`` and ``parse_dates`` settings stored in the config.

        Args:
            test_csv: Path to the CSV to score; it must contain ``line_id``.
            prediction_csv: Destination CSV with ``line_id`` and
                ``prediction`` columns.

        Returns:
            A ``(result, score)`` tuple. ``score`` is the value returned by
            ``validate`` when a companion target file exists next to the
            test file, otherwise ``None``.
        """
        self.config["task"] = "predict"
        # os.path.join keeps the tmp dir relative when ``prediction_csv`` has
        # no directory part; plain concatenation would yield the root "/tmp".
        self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        result = {
            "line_id": [],
            "prediction": [],
        }

        for X in pd.read_csv(
                test_csv,
                encoding="utf-8",
                low_memory=False,
                dtype=self.config["dtype"],
                parse_dates=self.config["parse_dates"],
                chunksize=self.config["nrows"]
        ):
            # Collect the ids before preprocess() touches the chunk.
            result["line_id"] += list(X["line_id"])
            preprocess(X, self.config)
            result["prediction"] += list(predict(X, self.config))

        result = pd.DataFrame(result)
        result.to_csv(prediction_csv, index=False)

        # Derive the companion file from the file name only, so a "test"
        # substring in a parent directory is left alone.
        test_dir, test_name = os.path.split(test_csv)
        target_csv = os.path.join(
            test_dir, test_name.replace("test", "test-target"))

        # If the name held no "test", the path is unchanged and would make
        # the test file validate against itself.
        if target_csv != test_csv and os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        """Persist the config via ``Config.save``."""
        self.config.save()

    @timeit
    def load(self):
        """Restore the config via ``Config.load``."""
        self.config.load()
Пример #2
0
def leak_detect(df: pd.DataFrame, config: Config) -> bool:
    """Look for a target leak hidden in a lagged ``number_*`` column.

    In predict mode nothing is computed: the function only reports whether
    a ``"leak"`` entry is already present in ``config``.

    Otherwise, for every ``id_*`` column the rows sharing that column's
    first non-null value are taken, ordered by each ``datetime_*`` column in
    turn, and ``target`` is correlated with every ``number_*`` column
    shifted by lags -1 through -9. The first correlation of at least 0.99
    is recorded in ``config["leak"]`` and ends the search.

    Args:
        df: Frame containing ``target`` plus the prefixed feature columns.
        config: Pipeline configuration; receives the ``"leak"`` entry.

    Returns:
        ``True`` if a leak was found (or is already recorded in predict
        mode), ``False`` otherwise.
    """
    if config.is_predict():
        return "leak" in config

    id_cols = [c for c in df if c.startswith('id_')]
    dt_cols = [c for c in df if c.startswith('datetime_')]
    if not id_cols or not dt_cols:
        return False

    num_cols = [c for c in df if c.startswith('number_')]
    for id_col in id_cols:
        # An empty frame or an all-null id column has no group to inspect;
        # a leading NaN must not be used as the group key.
        known_ids = df[id_col].dropna()
        if known_ids.empty:
            continue

        # A boolean mask selects the same rows as
        # groupby(...).get_group(...) without grouping the whole frame.
        group = df[df[id_col] == known_ids.iloc[0]]

        for dt_col in dt_cols:
            sorted_group = group.sort_values(dt_col)
            target = sorted_group['target']

            # NOTE(review): with very short groups the larger lags leave
            # only a couple of overlapping rows, which can produce a
            # spurious +/-1 correlation - confirm whether a minimum
            # overlap is wanted.
            for lag in range(-1, -10, -1):
                for col in num_cols:
                    corr = target.corr(sorted_group[col].shift(lag))
                    # A NaN correlation compares False and is skipped.
                    if corr >= 0.99:
                        config["leak"] = {
                            "num_col": col,
                            "lag": lag,
                            "id_col": id_col,
                            "dt_col": dt_col,
                        }
                        return True

    return False
Пример #3
0
def feature_selection(df: pd.DataFrame, config: Config):
    """Pick useful columns on large training frames and drop the rest.

    During training, frames smaller than 2 GiB are left untouched and the
    function returns immediately. For larger frames, up to ten 1000-row
    samples are run through ``preprocess_pipeline`` and ``select_features``;
    the columns that were never selected are recorded in
    ``config["drop_number_columns"]`` / ``config["drop_datetime_columns"]``,
    and the selected date parts in ``config["date_columns"]``.

    Whatever drop lists are present in ``config`` afterwards (freshly
    computed, or already stored when not training) are removed from ``df``
    in place.
    """
    if config.is_train():
        size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if size_mb < 2 * 1024:
            return

        chosen = []
        sample_config = copy.deepcopy(config)
        for seed in range(10):
            sample = df.sample(min(1000, len(df)), random_state=seed)
            sample = sample.copy(deep=True)
            preprocess_pipeline(sample, sample_config)
            target = sample["target"]
            features = sample.drop("target", axis=1)

            # Columns picked in earlier rounds are not offered again.
            if chosen:
                features = features.drop(chosen, axis=1)

            if len(features.columns) == 0:
                break
            chosen += select_features(features, target, config["mode"])

        log("Selected columns: {}".format(chosen))

        unused_numeric = [
            name for name in df
            if name.startswith(("number_", "id_")) and name not in chosen
        ]
        if unused_numeric:
            config["drop_number_columns"] = unused_numeric

        # Selected datetime features are split on "_": the first two pieces
        # name the source column, the third names the date part.
        config["date_columns"] = {}
        chosen_datetime = [n for n in chosen if n.startswith("datetime_")]
        for name in chosen_datetime:
            pieces = name.split("_")
            source_col = pieces[0] + "_" + pieces[1]
            part = pieces[2]
            config["date_columns"].setdefault(source_col, []).append(part)

        unused_datetime = [
            name for name in df
            if name.startswith("datetime_")
            and name not in config["date_columns"]
        ]
        if unused_datetime:
            config["drop_datetime_columns"] = unused_datetime

    drop_steps = (
        ("drop_number_columns", "Drop number columns: {}"),
        ("drop_datetime_columns", "Drop datetime columns: {}"),
    )
    for key, message in drop_steps:
        if key in config:
            log(message.format(config[key]))
            df.drop(config[key], axis=1, inplace=True)
Пример #4
0
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    """Load ``csv_path`` into a DataFrame using the settings in ``config``.

    When ``config`` has no ``"dtype"`` entry yet, ``preview_df`` is run on
    the file first. In train mode the number of loaded rows is stored in
    ``config["nrows"]``.
    """
    needs_preview = "dtype" not in config
    if needs_preview:
        preview_df(csv_path, config)

    frame = pandas_read_csv(csv_path, config)
    if not config.is_train():
        return frame

    config["nrows"] = len(frame)
    return frame
Пример #5
0
def subsample(df: pd.DataFrame, config: Config, max_size_mb: float = 2.0):
    """Shrink ``df`` in place to roughly ``max_size_mb`` while training.

    Outside train mode the function does nothing. In train mode the deep
    memory footprint of ``df`` is measured; if it exceeds ``max_size_mb``,
    a random subset of rows (fixed seed) is kept and the rest are dropped
    in place. The resulting row count is stored in ``config["nrows"]``.

    Args:
        df: Training frame, modified in place.
        config: Pipeline configuration; receives ``"nrows"``.
        max_size_mb: Target upper bound for the frame size in megabytes.
    """
    if not config.is_train():
        return

    df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
    total_rows = len(df)

    # A frame with a single row (or none) cannot be reduced any further.
    if df_size_mb > max_size_mb and total_rows > 1:
        mem_per_row = df_size_mb / total_rows
        # Clamp to [1, total_rows - 1]: train_test_split rejects a
        # train_size of 0 and one that leaves no rows to drop.
        sample_rows = int(max_size_mb / mem_per_row)
        sample_rows = min(total_rows - 1, max(1, sample_rows))

        log("Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. Subsample to {} rows."
            .format(df_size_mb, total_rows, sample_rows))
        _, df_drop = train_test_split(df,
                                      train_size=sample_rows,
                                      random_state=1)
        df.drop(df_drop.index, inplace=True)

    # Record the real row count: the drop is label-based, so duplicated
    # index labels can remove more rows than ``sample_rows`` implies.
    config["nrows"] = len(df)
Пример #6
0
def non_negative_target_detect(df: pd.DataFrame, config: Config):
    """Record in train mode whether ``target`` has no negative values.

    The flag is stored in ``config["non_negative_target"]``; outside train
    mode the config is left untouched.
    """
    if not config.is_train():
        return

    negative_count = (df["target"] < 0).sum()
    config["non_negative_target"] = negative_count == 0
Пример #7
0
 def __init__(self, model_dir: str):
     """Ensure ``model_dir`` exists, then attach a ``Config`` built from it."""
     os.makedirs(name=model_dir, exist_ok=True)
     config = Config(model_dir)
     self.config = config