Example #1
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, Lasso, LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, roc_auc_score)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor

# open_json and Logger are project-local helpers (not shown here).


class ModelGenerator(object):
    def __init__(self, config_path):
        self.config = open_json(config_path)
        self.logger = Logger()

        self.models = self.build_model()

    def build_model(self):
        self.logger.log(
            f" - Building a model [ {self.config['model']['MODEL']} ]")

        model_Lasso = make_pipeline(RobustScaler(),
                                    Lasso(alpha=0.000327, random_state=18))

        model_ENet = make_pipeline(
            RobustScaler(),
            ElasticNet(alpha=0.00052, l1_ratio=0.70654, random_state=18))

        model_GBoost = GradientBoostingRegressor(
            n_estimators=3000,
            learning_rate=0.05,
            max_depth=4,
            max_features="sqrt",
            min_samples_leaf=15,
            min_samples_split=10,
            loss="huber",
            random_state=18,
        )

        model_XGB = XGBRegressor(
            colsample_bylevel=0.9229733609038979,
            colsample_bynode=0.21481791874780318,
            colsample_bytree=0.607964318297635,
            gamma=0.8989889254961725,
            learning_rate=0.009192310189734834,
            max_depth=3,
            n_estimators=3602,
            reg_alpha=3.185674564163364e-12,
            reg_lambda=4.95553539265423e-13,
            seed=18,
            subsample=0.8381904293270576,
            verbosity=0,
        )
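
        # NOTE: unlike the regressors above, LogisticRegression is a
        # classifier; fit_model() below fits only this model, which is
        # why metrics() reports classification scores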

        model_logistic = LogisticRegression()

        models = {
            "Lasso": model_Lasso,
            "ENet": model_ENet,
            "GBoost": model_GBoost,
            "XGBoost": model_XGB,
            "LogReg": model_logistic,
        }

        return models

    def fit_model(self, dataset, metaset):
        # hold out the first 45569 training rows as a validation split
        # (the cut-off is hardcoded for this dataset)
        dataset["valid"] = dataset["train"][:45569]
        dataset["train"] = dataset["train"][45569:]

        train_label = dataset["train"][metaset["__target__"]]
        train_value = dataset["train"].drop(columns=metaset["__target__"])

        valid_label = dataset["valid"][metaset["__target__"]]
        valid_value = dataset["valid"].drop(columns=metaset["__target__"])

        predicts = dict()
        models = self.models

        def fitting(model, x_train, x_test, y_train, y_test):
            model.fit(x_train, y_train)
            y_pred = model.predict(x_test)
            self.metrics(y_test, y_pred)
            return y_pred

        print("FIT - LogReg")
        predicts["LogReg"] = fitting(
            model=models["LogReg"],
            x_train=train_value,
            x_test=valid_value,
            y_train=train_label,
            y_test=valid_label,
        )
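
        # Hedged sketch (assumption, not part of the original flow): fitting
        # the remaining regressors and scoring with MSE, which the
        # commented-out averaging below would consume. metrics() is
        # classification-only, so it is bypassed here.
        # from sklearn.metrics import mean_squared_error
        # for name in ("Lasso", "ENet", "GBoost", "XGBoost"):
        #     models[name].fit(train_value, train_label)
        #     predicts[name] = models[name].predict(valid_value)
        #     mse = mean_squared_error(valid_label, predicts[name])
        #     print(f"FIT - {name}: MSE {mse:.4f}")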

        # log_train_predict = (
        #     predicts["Lasso"]
        #     + predicts["ENet"]
        #     + predicts["GBoost"]
        #     + predicts["XGBoost"]
        # ) / 4

        # train_score = mean_squared_error(train_label, log_train_predict)
        # print(f"Scoring with train data : {train_score}")

    def metrics(self, y_test, y_pred):
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_score = roc_auc_score(y_test, y_pred, average="macro")
        print(
            f"accr : {accuracy:.2f}, prec : {precision:.2f}, recall : {recall:.2f}"
        )
        print(f"f1   : {f1:.2f},  auc : {roc_score:.2f}")
Example #2
import pandas as pd

# open_json, Logger, EDA, request_user_input, ask_boolean, get_meta_info,
# show_col_info, and add_col_info are project-local helpers (not shown here).


class DataAnalyzer(object):
    def __init__(self, config_path, dataset, metaset):
        self.config = open_json(config_path)

        self.logger = Logger()
        self.eda = EDA(self.config["analyzer"])
        self.dataset = dataset
        self.metaset = metaset

    def analize(self):
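        # Interactive EDA pass: log a dataset summary, plot the target
        # distribution, then walk the dtype (2.1) and feature (2.2) review.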
        dataset = self.dataset
        metaset = self.metaset

        pd.set_option("display.max_columns", metaset["__ncolumns__"])
        pd.set_option("display.width", 1000)

        self.logger.log(
            f"DATASET Analysis \n"
            f" Total Train dataset : {metaset['__nrows__']['train']} \n"
            f" Total Test  dataset : {metaset['__nrows__']['test']} \n"
            f" Total Columns num   : {metaset['__ncolumns__']}  \n"
            f" Target label        : {metaset['__target__']} \n"
            f" Target dtype        : {dataset['train'][metaset['__target__']].dtype} \n"
        )

        self.eda.countplot(
            dataframe=dataset["train"],
            column=metaset["__target__"],
            title="Target Label Distributions",
        )

        request_user_input()  # pause until the user confirms to continue

        self.analize_dtype()  # 2.1
        self.analize_feature()

    def analize_dtype(self):
        self.logger.log(" - 2.1 Analize Dtype", level=2)

        # SHOW INFO
        print(get_meta_info(self.metaset, self.dataset))

        # USER COMMAND
        answer = ask_boolean("Are there any issues that need to be corrected?")

        while answer:
            target_index = request_user_input(
                f"Please enter the index of the target to be modified.",
                valid_inputs=range(self.metaset["__ncolumns__"]),
                skipable=True,
                default=None,
            )

            if target_index is None:
                break

            target_col = self.metaset["__columns__"][int(target_index)]
            self.convert_dtype(target_col)

            print(get_meta_info(self.metaset, self.dataset))
            answer = ask_boolean(
                "Are there any issues that need to be corrected?")

    def analize_dataset(self):
        metaset = self.metaset
        dataset = self.dataset

        self.logger.log(
            f"DATASET Analysis \n"
            f" Total Train dataset : {metaset['__nrows__']['train']} \n"
            f" Total Test  dataset : {metaset['__nrows__']['test']} \n"
            f" Total Columns num   : {metaset['__ncolumns__']}  \n"
            f" Target label        : {metaset['__target__']} \n"
            f"  [train distribute(percent.)]\n{metaset['__distribution__']['train']} \n"
            f"  [test  distribute(percent.)]\n{metaset['__distribution__']['test']} \n"
        )

        request_user_input()

        for col in metaset["__columns__"]:
            col_meta = metaset[col]
            self.logger.log(f"{col_meta['index']:3d} "
                            f"{col_meta['name']:20} "
                            f"{col_meta['dtype']:10} "
                            f"{col_meta['descript']}")

        answer = ask_boolean("Are there any issues that need to be corrected?")
        self.config["options"]["FIX_COLUMN_INFO"] = answer

        if self.config["options"]["FIX_COLUMN_INFO"] is True:
            self.analize_feature()

    def analize_feature(self):
        self.logger.log("- 2.2 : Check Data Features", level=2)

        for col in self.metaset["__columns__"]:
            col_meta = self.metaset[col]
            col_data = self.dataset["train"][col]
            show_col_info(col_meta, col_data)
            answer = ask_boolean(
                "Are there any issues that need to be corrected?", default="N")

            while answer:
                target = request_user_input(
                    "Please enter issue [none, dtype]",
                    valid_inputs=["dtype"],
                    skipable=True,
                    default=None,
                )

                if target == "dtype":  # must match the lowercase valid_inputs
                    self.convert_dtype(col)
                show_col_info(col_meta, col_data)
                answer = ask_boolean(
                    "Are there any issues that need to be corrected?",
                    default="N")

        print(get_meta_info(self.metaset, self.dataset))

        return self.dataset

    def convert_dtype(self, col):
        right_dtype = request_user_input(
            "Please enter right dtype [num-int, num-float, bool, category, datetime]",
            valid_inputs=[
                "num-int", "num-float", "bool", "category", "datetime"
            ],
            skipable=True,
            default=None,
        )

        print(f"You selected dtype {right_dtype}")

        # dispatch on the lowercase values accepted above;
        # num-int / num-float currently have no converter
        if right_dtype == "datetime":
            self.convert_datetime(col)
        elif right_dtype == "category":
            self.convert_category(col)
        elif right_dtype == "bool":
            self.convert_boolean(col)

    def convert_datetime(self, col):
        self.dataset["train"][col] = pd.to_datetime(self.dataset["train"][col])
        self.metaset[col]["log"].append(
            f"dtype changed : {self.metaset[col]['dtype']} to Datetime")
        self.metaset[col]["dtype"] = "Datetime"

        answer = ask_boolean("Do you want to split datetime?")
        if answer:
            metaset, trainset = self.metaset, self.dataset["train"]
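            # add_col_info returns (metaset, column); the tuple unpacking
            # below writes each derived column straight into the train frame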

            metaset, trainset[f"{col}_year"] = add_col_info(
                metaset, trainset[col].dt.year, f"{col}_year")
            metaset, trainset[f"{col}_month"] = add_col_info(
                metaset, trainset[col].dt.month, f"{col}_month")
            metaset, trainset[f"{col}_day"] = add_col_info(
                metaset, trainset[col].dt.day, f"{col}_day")
            metaset, trainset[f"{col}_hour"] = add_col_info(
                metaset, trainset[col].dt.hour, f"{col}_hour")
            metaset, trainset[f"{col}_dow"] = add_col_info(
                metaset, trainset[col].dt.day_name(), f"{col}_dow")

            self.metaset = metaset
            self.dataset["train"] = trainset

    def convert_category(self, col):
        col_meta = self.metaset[col]
        col_data = self.dataset["train"][col]

        col_data = col_data.apply(str)
        col_meta["log"].append(
            f"dtype changed : {col_meta['dtype']} to Category")
        col_meta["dtype"] = "Category"

        col_meta["unique"] = col_data.unique()
        col_meta["rate"] = (col_data.value_counts(), )

        self.metaset[col] = col_meta
        self.dataset["train"][col] = col_data

    def convert_boolean(self, col):
        col_meta = self.metaset[col]
        col_data = self.dataset["train"][col]

        col_data = col_data.apply(str)
        col_meta["log"].append(
            f"dtype changed : {col_meta['dtype']} to Boolean")
        col_meta["dtype"] = "Boolean"

        col_meta["rate"] = col_data.value_counts()

        self.metaset[col] = col_meta
        self.dataset["train"][col] = col_data

    def get_meta_info(self, columns):
        info = list()
        for col in columns:
            col_meta = self.metaset[col]
            col_info = {
                "name": col,
                "dtype": col_meta["dtype"],
                "desc": col_meta["descript"],
            }
            # take a few sample values by position (row labels may not be ints)
            for i in range(1, 6):
                col_info[f"sample{i}"] = self.dataset["train"][col].iloc[i]

            info.append(col_info)
        info_df = pd.DataFrame(info)
        self.logger.log(f" - Dtype \n {info_df}\n\n", level=3)
        return info_df
Example #3
# DataLoader, DataAnalyzer, ModelGenerator, and PreProcessor are the project
# classes from the other examples (PreProcessor is not shown); Logger and
# init_logger are project-local helpers.


def _main_(args):
    init_logger()
    logger = Logger()

    logger.log("Step 0 >> Setting ")

    logger.log("Step 1 >> Data Preparation")
    logger.log("- 1 : Data Collection ", level=1)
    loader = DataLoader(config_path="./config.json")

    logger.log("- 2 : Data Analization ", level=1)
    analyzer = DataAnalyzer(
        config_path="./config.json",
        dataset=loader.dataset,
        metaset=loader.metaset,
    )
    analyzer.analize()

    logger.log("Step 3 >> Model Generation")
    model_generator = ModelGenerator(config_path="./config.json")
    models = model_generator.models

    logger.log("Step 4 >> Data Preprocess")
    preprocessor = PreProcessor(
        config_path="./config.json",
        dataset=analyzer.dataset,
        metaset=analyzer.metaset,
    )

    # (x_value, x_label), (y_value, y_label) = preprocessor.label_split()

    logger.log("Step 5 >> Model Evaluation")
    models = model_generator.fit_model(
        dataset=analyzer.dataset,
        metaset=analyzer.metaset,
    )
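

# Minimal entry-point sketch (assumption; the original guard is not shown):
if __name__ == "__main__":
    import sys

    _main_(sys.argv[1:])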
Example #4
import os

from sklearn.model_selection import train_test_split

# open_json, open_csv, Logger, init_set_info, and init_col_info are
# project-local helpers (not shown here).


class DataLoader(object):
    """
    Data Loader
    """

    def __init__(self, config_path):
        self.config = open_json(config_path)

        self.filepath = self.config["dataset"]["filepath"]
        self.basepath = self.config["dataset"]["dirpath"]
        self.format = self.config["dataset"]["format"].lower()

        self.logger = Logger()
        self.dataset = self.load_dataset()  # 1.1
        self.metaset = self.load_metaset()  # 1.2

    def load_dataset(self):
        """
        1.1 Data Loading
        """
        self.logger.log(
            f"- 1.1 {self.config['dataset']['category']} type dataset loading .. ",
            level=2,
        )

        # keep self.filepath relative: read_csv() joins it with basepath
        # itself, so overwriting it here would double the base directory
        datapath = os.path.join(self.basepath, self.filepath)

        self.logger.log(f"- '{datapath}' is now loading...", level=2)

        dataset = {
            "train": self.read_csv("train"),
            "valid": self.read_csv("valid"),
            "test": self.read_csv("test"),
        }

        if dataset["test"] is None:
            dataset = self.split_dataset(dataset, "train", "test")

        return dataset

    def read_csv(self, name):
        filepath = os.path.join(self.basepath, self.filepath, name + "." + self.format)

        # the config may encode a missing index column as the string "None"
        index_col = self.config["dataset"].get("index", None)
        index_col = index_col if index_col != "None" else None

        try:
            csv_file = open_csv(filepath=filepath, index_col=index_col)
            self.logger.log(f"- {name:5} data{csv_file.shape} is now loaded", level=3)

        except FileNotFoundError:
            # a missing split (e.g. no test file) is signalled as None;
            # load_dataset() then derives it via split_dataset()
            csv_file = None

        return csv_file

    def split_dataset(self, dataset, origin, target):
        split_ratio = self.config["dataset"]["split_ratio"]

        dataset[origin], dataset[target] = train_test_split(
            dataset[origin], train_size=split_ratio, random_state=42
        )

        self.logger.log(
            f"- {origin:5} data{dataset[origin].shape}"
            f", {target:5} data{dataset[target].shape}"
            f"  (split ratio: {split_ratio})",
            level=3,
        )

        return dataset

    def load_metaset(self):
        """
        1.2 Metadata Preparation
        """
        self.logger.log("- 1.2 Prepare metadata", level=2)

        def convert_dict(dtype):
            # maps pandas dtypes to the project's dtype labels
            # (defined but not referenced below)
            return {
                "Int64": "Num_int",
                "Float64": "Num_float",
                "object": "Cat",
            }[dtype.name]

        metaset = init_set_info(self.config, self.dataset)
        metaset = self.read_description(metaset)

        for col in metaset["__columns__"]:
            col_data = self.dataset["train"][col].convert_dtypes()
            metaset[col] = init_col_info(metaset, col_data, col)

        return metaset
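
    # Rough shape of the metaset used across these examples (inferred from
    # usage; the exact construction lives in init_set_info / init_col_info):
    #   {
    #     "__columns__":  [column names...],
    #     "__ncolumns__": int,
    #     "__nrows__":    {"train": int, "test": int},
    #     "__target__":   str,
    #     "<col>":        {"index", "name", "dtype", "descript", "log", ...},
    #   }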

    def read_description(self, metaset):
        descfile = self.config["metaset"].get("descpath", None)
        if descfile is None:
            return metaset

        descpath = os.path.join(
            self.config["dataset"]["dirpath"],
            self.config["dataset"]["filepath"],
            descfile,
        )

        try:
            with open(descpath, "r", newline="\r\n") as desc_file:
                self.logger.log(f"- '{descpath}' is now loaded", level=3)

                desc_list = desc_file.read().splitlines()
                for desc_line in desc_list:
                    # lines are "<column>: <description>"; split on the first
                    # colon only, so descriptions may contain colons
                    col, desc = desc_line.split(":", 1)

                    metaset[col]["descript"] = desc.strip()

            return metaset

        except FileNotFoundError:
            self.logger.warn(f"Description File Not Found Error, '{descpath}'")
            return metaset