def create_schema():
    db = DBServices()
    dirs = glob.glob("./input/*")
    for dir_name in dirs:
        schema = dir_name.split("/")[-1]
        db.exec_query("CREATE SCHEMA IF NOT EXISTS {};".format(schema))

def insert_data():
    db = DBServices()
    fnames = glob.glob("./input/*/*/*.csv")
    for fname in tqdm(fnames):
        _, _, schema, table_name, _ = fname.split("/")
        df = pd.read_csv(fname)
        db.df_to_table(table_name=table_name, schema=schema, df=df, replace=False)

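# Usage sketch (assumption: the example path is illustrative, not part of the
# repository). The layout ./input/<schema>/<table_name>/<file>.csv is implied by
# the globs and the fname.split("/") unpacking above, e.g.
# ./input/public/train/train.csv loads into table public.train.
if __name__ == "__main__":
    create_schema()  # one schema per ./input/<schema> directory
    insert_data()    # one table per ./input/<schema>/<table_name> directory
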
class Feature(metaclass=ABCMeta):
    def __init__(self):
        self.db = DBServices()
        df = self.db.find_schema(like="", unlike="pg_")
        self.schemas = df[
            df["schema_name"] != "information_schema"]["schema_name"].values
        if self.__class__.__name__.isupper():
            self.name = self.__class__.__name__.lower()
        else:
            # Rename to snake_case
            self.name = re.sub("([A-Z])",
                               lambda x: "_" + x.group(1).lower(),
                               self.__class__.__name__).lstrip("_")

    @abstractmethod
    def create_features(self):
        raise NotImplementedError

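# Sketch of a concrete subclass (assumption: the class name "AgeBucket" and the
# "age" column are illustrative only; the DBServices.table_load/df_to_table
# signatures are taken from the calls elsewhere in this module). Each subclass
# writes one feature table per schema, named after its snake_case class name.
class AgeBucket(Feature):
    def create_features(self):
        for schema in self.schemas:
            # hypothetical source column "age"
            df = self.db.table_load(schema=schema, table_name="train",
                                    cols=[INDEX_COL, "age"])
            df[self.name] = df["age"] // 10  # bucket ages into decades
            self.db.df_to_table(table_name=self.name, schema=schema,
                                df=df[[INDEX_COL, self.name]], replace=True)
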
def split_tables_into_kfold(kfold_config_name: str):
    db = DBServices()
    n_splits = kfold_config[kfold_config_name]["n_splits"]
    seed = kfold_config[kfold_config_name]["seed"]
    df = db.table_load(schema="public",
                       table_name="train",
                       cols=[INDEX_COL, TARGET_COL])
    folds = StratifiedKFold(n_splits=n_splits,
                            shuffle=True,
                            random_state=seed).split(df[INDEX_COL],
                                                     df[TARGET_COL])
    for n_fold, (train_index, test_index) in enumerate(folds):
        with timer("Split No.{}".format(n_fold)):
            schema = "{}_{}".format(kfold_config_name, n_fold)
            db.exec_query("CREATE SCHEMA IF NOT EXISTS {};".format(schema))
            db.exec_query("DROP TABLE IF EXISTS {}.train;".format(schema))
            db.exec_query("DROP TABLE IF EXISTS {}.test;".format(schema))
            query = "SELECT * INTO {0}.train FROM public.train WHERE {1} IN ({2}) ORDER BY {1};".format(
                schema,
                INDEX_COL,
                ", ".join(df.iloc[train_index][INDEX_COL].astype(str)),
            )
            db.exec_query(query)
            query = "SELECT * INTO {0}.test FROM public.train WHERE {1} IN ({2}) ORDER BY {1};".format(
                schema,
                INDEX_COL,
                ", ".join(df.iloc[test_index][INDEX_COL].astype(str)))
            db.exec_query(query)

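# Sketch of a kfold_config entry (assumption: the key name "stratified_5fold"
# and the values are illustrative; only the "n_splits"/"seed" fields and the
# "<config_name>_<fold>" schema naming are taken from the function above).
kfold_config = {
    "stratified_5fold": {"n_splits": 5, "seed": 42},
}
# split_tables_into_kfold("stratified_5fold") would then create schemas
# stratified_5fold_0 ... stratified_5fold_4, each holding a train/test table.
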
class Model:
    def __init__(self, config_name: dict):
        self.config_name = config_name
        self.config = exp_config[config_name]
        self.db = DBServices()
        self.train_cols = self.config["features"]["train"]
        self.target_col = self.config["features"]["target"]
        self.model_name = self.config["model"]["name"]
        self.model = models[self.model_name]()
        self.schemas = self.db.find_schema(
            like=self.config["kfold_config_name"])["schema_name"].values
        self.metrics = metrics[self.config["metrics"]]

    def cross_validation(self):
        for schema in self.schemas:
            train = self.db.table_load(
                schema=schema,
                table_name="train",
                cols=self.train_cols + self.target_col + [INDEX_COL],
            )
            test = self.db.table_load(
                schema=schema,
                table_name="test",
                cols=self.train_cols + self.target_col + [INDEX_COL],
            )
            self.model = models[self.model_name](
                **self.config["model"]["params"])
            self.model.fit(train[self.train_cols],
                           train[self.target_col].iloc[:, 0])
            pred = self.model.predict(test[self.train_cols])
            score = self.metrics(test[self.target_col].iloc[:, 0], pred)
            print()
            print("========================================")
            print("{}: {} Score ::: {}".format(self.config_name, schema, score))
            print("========================================")
            print()
            result_df = pd.DataFrame({
                INDEX_COL: test[INDEX_COL],
                "pred": pred,
                "real": test[self.target_col].iloc[:, 0],
            })
            self.db.df_to_table(
                table_name=self.config_name + "_result",
                schema=schema,
                df=result_df,
                replace=True,
            )
            if not os.path.exists("./output/models/{}".format(
                    self.config_name)):
                os.makedirs("./output/models/{}".format(self.config_name))
            with open(
                    "./output/models/{}/{}.pickle".format(
                        self.config_name, schema), "wb") as f:
                pickle.dump(self.model, f)

    def predict(self):
        schema = "public"
        train = self.db.table_load(
            schema=schema,
            table_name="train",
            cols=self.train_cols + self.target_col + [INDEX_COL],
        )
        test = self.db.table_load(schema=schema,
                                  table_name="test",
                                  cols=self.train_cols + [INDEX_COL])
        self.model = models[self.model_name](**self.config["model"]["params"])
        self.model.fit(train[self.train_cols],
                       train[self.target_col].iloc[:, 0])
        pred = self.model.predict(test[self.train_cols])
        if not os.path.exists("./output/models/{}".format(self.config_name)):
            os.makedirs("./output/models/{}".format(self.config_name))
        with open(
                "./output/models/{}/{}.pickle".format(self.config_name,
                                                      schema), "wb") as f:
            pickle.dump(self.model, f)
        result_df = pd.DataFrame({
            INDEX_COL: test[INDEX_COL],
            TARGET_COL: pred
        })
        self.create_submission(result_df=result_df)

    def create_submission(self, result_df: pd.DataFrame):
        submission_file_prefix = "./output/submission/submission_{}".format(
            pd.to_datetime("today").strftime("%Y-%m-%d"))
        submission_no = len(glob.glob(submission_file_prefix + "_*.csv")) + 1
        submission_file_name = "{}_{}.csv".format(submission_file_prefix,
                                                  submission_no)
        result_df.to_csv(submission_file_name, index=False)
        print("Submission file: {} saved!".format(submission_file_name))

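# Sketch of an exp_config entry and a run (assumption: the key names "exp001",
# "lgbm", "auc" and the feature columns are illustrative; only the nested keys
# read in __init__ above are taken from the class itself). Note that
# "features"/"target" must be a list, since __init__ concatenates it with the
# training columns and the fit/predict calls use .iloc[:, 0].
exp_config = {
    "exp001": {
        "features": {"train": ["feat_a", "feat_b"], "target": [TARGET_COL]},
        "model": {"name": "lgbm", "params": {"n_estimators": 100}},
        "kfold_config_name": "stratified_5fold",
        "metrics": "auc",
    },
}

# model = Model("exp001")
# model.cross_validation()  # scores each <kfold_config_name>_<fold> schema
# model.predict()           # retrains on public.train and writes a submission CSV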