# Shared imports for the snippets below. Project-specific names
# (DBServices, exp_config, kfold_config, models, metrics, timer,
# INDEX_COL, TARGET_COL) are assumed to come from the surrounding project
# and are not reproduced here.
import glob
import os
import pickle
import re
from abc import ABCMeta, abstractmethod

import pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm


def create_schema():
    """Create one PostgreSQL schema per top-level directory under ./input."""
    db = DBServices()
    dirs = glob.glob("./input/*")

    for dir_name in dirs:
        schema = dir_name.split("/")[-1]
        db.exec_query("CREATE SCHEMA IF NOT EXISTS {};".format(schema))


def insert_data():
    """Load every ./input/<schema>/<table>/*.csv file into the <schema>.<table> table."""
    db = DBServices()
    fnames = glob.glob("./input/*/*/*.csv")

    for fname in tqdm(fnames):
        _, _, schema, table_name, _ = fname.split("/")
        df = pd.read_csv(fname)

        db.df_to_table(table_name=table_name, schema=schema, df=df, replace=False)
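

# Usage sketch (an assumption, not part of the original snippets): with an
# ./input/<schema>/<table>/<file>.csv layout, the two helpers above mirror
# the directory tree into PostgreSQL.
if __name__ == "__main__":
    create_schema()  # one schema per top-level directory under ./input
    insert_data()    # one table per CSV file, named after its parent directory

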
class Feature(metaclass=ABCMeta):
    """Abstract base for feature generators; the snake_case class name becomes self.name."""

    def __init__(self):
        self.db = DBServices()
        df = self.db.find_schema(like="", unlike="pg_")
        self.schemas = df[
            df["schema_name"] != "information_schema"]["schema_name"].values
        if self.__class__.__name__.isupper():
            self.name = self.__class__.__name__.lower()
        else:
            # Rename to snake_case
            self.name = re.sub("([A-Z])", lambda x: "_" + x.group(1).lower(),
                               self.__class__.__name__).lstrip("_")

    @abstractmethod
    def create_features(self):
        raise NotImplementedError
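

# A hypothetical concrete Feature, shown only to illustrate the intended
# subclassing pattern; the class name, the source column "age" and the
# bucketing rule are invented for this sketch. The base __init__ supplies
# self.db, self.schemas and the snake_case self.name ("age_bucket" here),
# which doubles as the feature table name.
class AgeBucket(Feature):
    def create_features(self):
        for schema in self.schemas:
            train = self.db.table_load(schema=schema,
                                       table_name="train",
                                       cols=[INDEX_COL, "age"])
            feature_df = pd.DataFrame({
                INDEX_COL: train[INDEX_COL],
                self.name: train["age"] // 10,  # decade buckets
            })
            # Store the feature alongside the schema's train/test tables.
            self.db.df_to_table(table_name=self.name,
                                schema=schema,
                                df=feature_df,
                                replace=True)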


def split_tables_into_kfold(kfold_config_name: str):
    """Split public.train into stratified folds, one schema per fold with train/test tables."""
    db = DBServices()

    n_splits = kfold_config[kfold_config_name]["n_splits"]
    seed = kfold_config[kfold_config_name]["seed"]

    df = db.table_load(schema="public",
                       table_name="train",
                       cols=[INDEX_COL, TARGET_COL])
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True,
                            random_state=seed).split(df[INDEX_COL],
                                                     df[TARGET_COL])

    for n_fold, (train_index, test_index) in enumerate(folds):
        with timer("Split No.{}".format(n_fold)):
            schema = "{}_{}".format(kfold_config_name, n_fold)
            db.exec_query("CREATE SCHEMA IF NOT EXISTS {};".format(schema))
            db.exec_query("DROP TABLE IF EXISTS {}.train;".format(schema))
            db.exec_query("DROP TABLE IF EXISTS {}.test;".format(schema))

            query = "SELECT * INTO {0}.train FROM public.train WHERE {1} IN ({2}) ORDER BY {1};".format(
                schema,
                INDEX_COL,
                ", ".join(df.iloc[train_index][INDEX_COL].astype(str)),
            )
            db.exec_query(query)
            query = "SELECT * INTO {0}.test FROM public.train WHERE {1} IN ({2}) ORDER BY {1};".format(
                schema, INDEX_COL,
                ", ".join(df.iloc[test_index][INDEX_COL].astype(str)))
            db.exec_query(query)
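

# The kfold_config entry read above is assumed to look like the literal
# below; "skf_5fold" is an invented name, and in the real project the dict
# presumably lives in a config module. split_tables_into_kfold("skf_5fold")
# would then create schemas skf_5fold_0 ... skf_5fold_4, each holding a
# stratified train and test table.
kfold_config = {
    "skf_5fold": {
        "n_splits": 5,
        "seed": 42,
    },
}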


class Model:
    """Train and evaluate a model defined by an exp_config entry, then build a submission."""

    def __init__(self, config_name: str):
        self.config_name = config_name
        self.config = exp_config[config_name]
        self.db = DBServices()
        self.train_cols = self.config["features"]["train"]
        self.target_col = self.config["features"]["target"]
        self.model_name = self.config["model"]["name"]
        self.model = models[self.model_name]()
        self.schemas = self.db.find_schema(
            like=self.config["kfold_config_name"])["schema_name"].values
        self.metrics = metrics[self.config["metrics"]]

    def cross_validation(self):
        """Fit and score the model on every k-fold schema, storing predictions and pickled models."""
        for schema in self.schemas:
            train = self.db.table_load(
                schema=schema,
                table_name="train",
                cols=self.train_cols + self.target_col + [INDEX_COL],
            )
            test = self.db.table_load(
                schema=schema,
                table_name="test",
                cols=self.train_cols + self.target_col + [INDEX_COL],
            )

            self.model = models[self.model_name](
                **self.config["model"]["params"])
            self.model.fit(train[self.train_cols],
                           train[self.target_col].iloc[:, 0])
            pred = self.model.predict(test[self.train_cols])

            score = self.metrics(test[self.target_col].iloc[:, 0], pred)
            print()
            print("========================================")
            print("{}: {} Score ::: {}".format(self.config_name, schema,
                                               score))
            print("========================================")
            print()

            result_df = pd.DataFrame({
                INDEX_COL: test[INDEX_COL],
                "pred": pred,
                "real": test[self.target_col].iloc[:, 0],
            })
            self.db.df_to_table(
                table_name=self.config_name + "_result",
                schema=schema,
                df=result_df,
                replace=True,
            )

            os.makedirs("./output/models/{}".format(self.config_name),
                        exist_ok=True)
            with open(
                    "./output/models/{}/{}.pickle".format(
                        self.config_name, schema), "wb") as f:
                pickle.dump(self.model, f)

    def predict(self):
        """Retrain on public.train, predict public.test, pickle the model and write a submission."""
        schema = "public"
        train = self.db.table_load(
            schema=schema,
            table_name="train",
            cols=self.train_cols + self.target_col + [INDEX_COL],
        )
        test = self.db.table_load(schema=schema,
                                  table_name="test",
                                  cols=self.train_cols + [INDEX_COL])

        self.model = models[self.model_name](**self.config["model"]["params"])
        self.model.fit(train[self.train_cols],
                       train[self.target_col].iloc[:, 0])
        pred = self.model.predict(test[self.train_cols])

        os.makedirs("./output/models/{}".format(self.config_name),
                    exist_ok=True)
        with open(
                "./output/models/{}/{}.pickle".format(self.config_name,
                                                      schema), "wb") as f:
            pickle.dump(self.model, f)

        result_df = pd.DataFrame({
            INDEX_COL: test[INDEX_COL],
            TARGET_COL: pred
        })

        self.create_submission(result_df=result_df)

    def create_submission(self, result_df: pd.DataFrame):
        """Write result_df to a date-stamped, sequentially numbered CSV under ./output/submission."""
        submission_file_prefix = "./output/submission/submission_{}".format(
            pd.to_datetime("today").strftime("%Y-%m-%d"))

        submission_no = len(glob.glob(submission_file_prefix + "_*.csv")) + 1
        submission_file_name = "{}_{}.csv".format(submission_file_prefix,
                                                  submission_no)
        result_df.to_csv(submission_file_name, index=False)
        print("Sumission file: {} saved!".format(submission_file_name))