示例#1
0
    def run(self):
        """Create this feature's columns for the main and the CV tables.

        Loads the columns named by ``self.depends_on()`` from the ``train``
        and ``test`` tables (the test load leaves out ``self.target_cols()``),
        passes them together with the ``memo`` table through
        ``self.create_features`` and stores the results. The same
        load / create / insert sequence is then repeated for every pair of
        tables returned by ``find_table_name`` for ``cv_train`` and
        ``cv_test`` (excluding ``stats``).

        Raises:
            ValueError: If the number of cv_train tables differs from the
                number of cv_test tables.
        """
        with timer(self.name):
            # The column lists are the same for every table, so resolve them
            # once instead of once per load (and once per column for targets).
            train_cols = list(self.depends_on())
            target_cols = list(self.target_cols())
            test_cols = [col for col in train_cols if col not in target_cols]

            train = table_load(table_name="train", cols=train_cols)
            test = table_load(table_name="test", cols=test_cols)
            memo = table_load(table_name="memo")
            train, test, memo = self.create_features(train, test, memo)
            insert_cols(table_name="train", df=train)
            insert_cols(table_name="test", df=test)
            table_write(table_name="memo", df=memo)

            cv_train_tables = find_table_name(
                like="cv_train", unlike="stats")["table_name"].tolist()
            cv_test_tables = find_table_name(
                "cv_test", unlike="stats")["table_name"].tolist()
            if len(cv_train_tables) != len(cv_test_tables):
                raise ValueError("# of cv_train is not equal to # of cv_test!")

            # Folds are paired by position in the two lists.
            # NOTE(review): this assumes find_table_name returns both lists in
            # the same fold order - confirm.
            for cv_train_table, cv_test_table in zip(cv_train_tables,
                                                     cv_test_tables):
                train = table_load(table_name=cv_train_table, cols=train_cols)
                test = table_load(table_name=cv_test_table, cols=test_cols)
                # memo is threaded through create_features for each fold, but
                # unlike the main pass it is not written back to the database.
                train, test, memo = self.create_features(train, test, memo)
                insert_cols(table_name=cv_train_table, df=train)
                insert_cols(table_name=cv_test_table, df=test)
示例#2
0
# Drop every table listed in drop_table_names with one multi-statement query.
if len(drop_table_names) > 0:
    drop_statements = [
        "DROP TABLE {};".format(name) for name in drop_table_names
    ]
    exec_query("".join(drop_statements))

# Collect the names of all tables matching one of the patterns below.
table_names = []
for pattern in ("train", "test", "cv_result"):
    table_names += find_table_name(like=pattern)["table_name"].to_list()

# For each table, write per-column summary statistics to "<table>_stats".
for table_name in tqdm(table_names):
    df = table_load(table_name=table_name)

    dtype_frame = df.dtypes.rename("dtype").astype(str).to_frame()
    null_frame = df.isnull().sum().rename("null_count").to_frame()
    # describe() labels the percentiles "25%", "50%", "75%"; give them
    # plain identifier-style names before they become column names.
    describe_frame = df.describe().T.rename(
        columns={"25%": "per_25", "50%": "per_50", "75%": "per_75"})

    stats = pd.concat(
        [dtype_frame, null_frame, describe_frame],
        axis=1,
        sort=False,
    )
    table_write(table_name="{}_stats".format(table_name), df=stats)
示例#3
0
import os

import pandas as pd

from db import exec_query, table_write

print("Initializing Database...")

table_names = ("train", "test", "memo")

# Remove any tables left over from a previous run.
for name in table_names:
    exec_query("DROP TABLE IF EXISTS {};".format(name))

# Read each input CSV from $PROJECT_DIR/input.
frames = {
    name: pd.read_csv(
        os.environ["PROJECT_DIR"] + "/input/{}.csv".format(name))
    for name in table_names
}

# Store the raw data in the database, one table per CSV.
for name, frame in frames.items():
    table_write(table_name=name, df=frame)

# Create an index on the "index" column of every table.
for name in table_names:
    exec_query("CREATE INDEX {0}_index on {0} (index);".format(name))

print("Done!!")
示例#4
0
                categorical_features=categorical_cols,
                target_cols=target_cols,
                train_cols=train_cols,
                params=params,
            )

            # Attach the raw predictions to the validation frame so that
            # postprocessing() can see them; its return value replaces y_pred.
            valid["survived"] = y_pred
            y_pred = postprocessing(train=train, test=valid)

            # Per-row comparison of prediction and ground truth for this fold.
            # NOTE(review): assumes y_pred is an array with .flatten() and
            # y_real lines up with it element-wise - confirm against the code
            # above this fragment.
            cv_result = pd.DataFrame({
                "index":
                valid.index,
                "predicted":
                y_pred.flatten(),
                "real":
                y_real,
                "difference":
                y_pred.flatten() - y_real,
                "difference_abs":
                abs(y_pred.flatten() - y_real),
            })
            # One result table per fold: cv_result_<n_fold>.
            table_write(table_name="cv_result_{}".format(n_fold), df=cv_result)

            # Threshold the predictions at 0.5 to get 0/1 labels, then score
            # the fold as the fraction of labels equal to y_real.
            predicted = (y_pred.flatten() > 0.5).astype(int)
            accuracy = (predicted == y_real).sum() / len(predicted)
            accuracies.append(accuracy)

            print("Accuracy: {}".format(accuracy))

    print("Total Accuracy: {}".format(np.mean(accuracies)))
示例#5
0
from db import table_load, table_write
from utils import timer

if __name__ == "__main__":
    # Split the train table into stratified folds and store every fold as its
    # own pair of tables: cv_train_<n> and cv_test_<n>.

    # The config name may be passed as the single command-line argument;
    # otherwise the default config is used.
    if len(sys.argv) == 2:
        config_file_name = sys.argv[1]
    else:
        config_file_name = "lightgbm_0"

    print("Config file Name: ", config_file_name)

    with timer("kfold"):
        # Open the config in a context manager so the file handle is closed
        # as soon as it has been parsed.
        with open("./configs/{}.json".format(config_file_name)) as config_file:
            config: dict = json.load(config_file)

        train = table_load("train")

        folds = StratifiedKFold(
            n_splits=config["cv"]["n_splits"],
            shuffle=True,
            random_state=config["cv"]["random_state"],
        ).split(train, train[config["features"]["target"]])

        for n_fold, (train_index, valid_index) in enumerate(folds):
            # split() yields positional row indices, so rows must be selected
            # by position (iloc). Label-based loc only gives the same rows
            # when the frame's index happens to be 0..n-1 in order.
            cv_train_df = train.iloc[train_index]
            cv_test_df = train.iloc[valid_index]
            table_write(table_name="cv_train_{}".format(n_fold),
                        df=cv_train_df)
            table_write(table_name="cv_test_{}".format(n_fold), df=cv_test_df)