Example #1
def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config):
    params = {
        "objective": "regression" if config.is_regression() else "binary",
        "metric": "rmse" if config.is_regression() else "auc",
        "verbosity": -1,
        "seed": 1,
    }

    X_sample, y_sample = data_sample(X, y, config, nrows=20000)
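    # Tune hyperparameters on this sample (at most 20,000 rows) before training on the full data.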
    hyperparams = hyperopt_lightgbm(X_sample, y_sample, params, config)

    X_train, X_val, y_train, y_val = data_split(X, y, config)

    config["model"] = lgb.train(
        {**params, **hyperparams},
        lgb.Dataset(X_train, label=y_train),
        5000,
        lgb.Dataset(X_val, label=y_val),
        early_stopping_rounds=100,
        verbose_eval=100,
    )
    config.save()

    try:
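        # Retrain on all data for ~1.2x the best validation iteration; abort if the remaining time budget runs out.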
        with time_limit(config.time_left() - 10):
            config["model"] = lgb.train(
                {**params, **hyperparams},
                lgb.Dataset(X, label=y),
                int(1.2 * config["model"].best_iteration),
            )
    except TimeoutException:
        Log.print("Timed out!")
Example #2
class AutoML:
    def __init__(self, model_dir: str):
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config.tmp_dir = self.config.model_dir + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)

        train(X, y, self.config)

    def predict(self, test_csv: str,
                prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        result = {
            "line_id": [],
            "prediction": [],
        }

        for X in pd.read_csv(test_csv,
                             encoding="utf-8",
                             low_memory=False,
                             dtype=self.config["dtype"],
                             parse_dates=self.config["parse_dates"],
                             chunksize=self.config["nrows"]):
            result["line_id"] += list(X["line_id"])
            preprocess(X, self.config)
            result["prediction"] += list(predict(X, self.config))

        result = pd.DataFrame(result)
        result.to_csv(prediction_csv, index=False)

        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
            logf('SCORE:', score)
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        self.config.save()

    @timeit
    def load(self):
        self.config.load()
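
A minimal usage sketch for the class above; the model directory, CSV paths, and the "regression" mode string are hypothetical:

automl = AutoML("models/demo")                  # hypothetical model directory
automl.train("data/train.csv", "regression")    # read, preprocess, fit and store the model in config
automl.save()                                   # persist the fitted config

automl.load()
result, score = automl.predict("data/test.csv", "predictions/prediction.csv")  # score is None unless a test-target CSV exists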
Example #3
def pipeline(
        df: pd.DataFrame,
        config: Config,
        train_csv: str = None,
        test_csv: str = None,
        prediction_csv: str = None) -> (pd.DataFrame, Optional[np.float64]):

    if config.is_train():
        config['stages'] = {}

    for ids, stage in enumerate(config['graph']):
        if len(stage) == 0 or stage[0] is None or stage[0] == '':
            config["stage"] = '{0}\n{1}'.format(
                config["stage"],
                'Invalid stage "{0}" in pipeline'.format(stage))
            raise ValueError(config["stage"])

        config["stage"] = stage[0]
        config["stage_nb"] = ids

        if config.is_train():
            config['stages'][config["stage"]] = {}

        config['stages'][config["stage"]]['time'] = 0
        start_time = time.time()

        if stage[0] == 'Start':
            continue
        # elif stage[0] == 'End':
        #     break

        elif stage[0] not in config['params']['pipeline']:
            config["stage"] = '{0}\n{1}'.format(
                config["stage"],
                'Unknown node "{0}" in pipeline'.format(stage[0]))
            raise ValueError(config["stage"])

        elif config['params']['pipeline'][stage[0]]['node'] not in _node_map:
            config["stage"] = '{0}\n{1}'.format(
                config["stage"], 'Unknown node "{0}" in _node_map'.format(
                    config['params']['pipeline'][stage[0]]['node']))
            raise ValueError(config["stage"])

        node = _node_map[config['params']['pipeline'][stage[0]]['node']]
        if node.name == 'read_df':
            if config.is_train():
                df = node(train_csv, config)

        elif 'args' in config['params']['pipeline'][stage[0]] \
                and len(config['params']['pipeline'][stage[0]]['args'])!=0:
            node.function(df, config,
                          **config['params']['pipeline'][stage[0]]['args'])
        else:
            node(df, config)

        stage_time_inc(config, start_time, stage[0])
Example #4
def feature_selection(df: pd.DataFrame, config: Config):
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb < 2 * 1024:
            return

        selected_columns = []
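        # Sample-based feature selection, capped at 10% of the time budget; only datasets over 2 GB reach this point.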
        config_sample = copy.deepcopy(config)
        config.limit_time_fraction(0.1)
        for i in range(20):
            if config.is_time_fraction_limit():
                break

            df_sample = df.sample(min(3000, len(df)), random_state=i).copy()
            transform(df_sample, config_sample)
            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            if len(selected_columns) > 0:
                X = X.drop(selected_columns, axis=1)

            if len(X.columns) > 0:
                selected_columns += select_features(X, y, config["mode"])
            else:
                break

        Log.print("Selected columns: {}".format(selected_columns))

        drop_number_columns = [c for c in df if c.startswith("number_") and c not in selected_columns]
        if len(drop_number_columns) > 0:
            config["drop_number_columns"] = drop_number_columns

        config["date_columns"] = {}
        for c in [c for c in selected_columns if c.startswith("datetime_")]:
            d = c.split("_")
            date_col = d[0] + "_" + d[1]
            date_part = d[2]

            if date_col not in config["date_columns"]:
                config["date_columns"][date_col] = []

            config["date_columns"][date_col].append(date_part)

        drop_datetime_columns = [c for c in df if c.startswith("datetime_") and c not in config["date_columns"]]
        if len(drop_datetime_columns) > 0:
            config["drop_datetime_columns"] = drop_datetime_columns

    if "drop_number_columns" in config:
        Log.print("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        Log.print("Drop datetime columns: {}".format(config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)
Example #5
class AutoML:
    def __init__(self, model_dir: str):
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config.tmp_dir = self.config.model_dir + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)
        train(X, y, self.config)

    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(test_csv, self.config)
        result = {
            "line_id": list(df["line_id"]),
            "prediction": [],
        }

        def chunker(seq, size):
            return (seq[pos:pos+size] for pos in range(0, len(seq), size))

        for chunk in chunker(df, 100000):
            X = chunk.copy()
            preprocess(X, self.config)
            result["prediction"] += list(predict(X, self.config))

        result = pd.DataFrame(result)
        result.sort_values("line_id", inplace=True)
        result.to_csv(prediction_csv, index=False)

        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score

    def save(self):
        self.config.save()

    def load(self):
        self.config.load()
Example #6
    def __init__(self, model_dir: str, params: dict, verbose: int=0):
        self.config = Config(model_dir, params, verbose=verbose)
        self.verbose=verbose

        if 'memory' not in self.config['params']:
            self.config['params']['memory'] = {}
        if 'max_size_mb' not in self.config['params']['memory']:
            self.config['params']['memory']['max_size_mb'] = 2
        if 'max_size_train_samples' not in self.config['params']['memory']:
            self.config['params']['memory']['max_size_train_samples'] = 10000
        if 'field_target_name' not in self.config['params']:
            self.config['params']['field_target_name'] = 'target'
Example #7
def train_h2o(X: pd.DataFrame, y: pd.Series, config: Config):

    h2o.init()

    X["target"] = y
    train = h2o.H2OFrame(X)
    train_x = train.columns
    train_y = "target"
    train_x.remove(train_y)

    if config["mode"] == "classification":
        train[train_y] = train[train_y].asfactor()

    aml = H2OAutoML(max_runtime_secs=int(config.time_left() * 0.9),
                    max_models=20,
                    nfolds=3,
                    exclude_algos=["GBM", "DeepLearning", "DRF"],
                    seed=42)

    aml.train(x=train_x, y=train_y, training_frame=train)

    config['params']['pipeline'][config["stage"]]["model"] = h2o.save_model(
        model=aml.leader, path=config.model_dir + "/h2o.model", force=True)
    if config.verbose: print(aml.leaderboard)

    X.drop("target", axis=1, inplace=True)
Example #8
def leak_detect(df: pd.DataFrame, config: Config) -> bool:
    if config.is_predict():
        return "leak" in config

    id_cols = [c for c in df if c.startswith('id_')]
    dt_cols = [c for c in df if c.startswith('datetime_')]

    if id_cols and dt_cols:
        num_cols = [c for c in df if c.startswith('number_')]
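        # For one sample group of each id column, sorted by datetime, a lagged numeric column that correlates almost perfectly (>= 0.99) with the target indicates a leak.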
        for id_col in id_cols:
            group = df.groupby(by=id_col).get_group(df[id_col].iloc[0])

            for dt_col in dt_cols:
                sorted_group = group.sort_values(dt_col)

                for lag in range(-1, -10, -1):
                    for col in num_cols:
                        corr = sorted_group['target'].corr(sorted_group[col].shift(lag))
                        if corr >= 0.99:
                            config["leak"] = {
                                "num_col": col,
                                "lag": lag,
                                "id_col": id_col,
                                "dt_col": dt_col,
                            }
                            return True

    return False
Example #9
File: read.py  Project: rshekhovtsov/CFT
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    if "dtype" not in config:
        preview_df(csv_path, config)

    df = optimize_dataframe(pandas_read_csv(csv_path, config))
    if config.is_train():
        config["nrows"] = len(df)

    return df
Example #10
def hyperopt_lightgbm(X: pd.DataFrame, y: pd.Series, params: Dict, config: Config):
    X_train, X_val, y_train, y_val = data_split(X, y, config, test_size=0.5)
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_val, label=y_val)

    space = {
        "learning_rate": hp.choice("learning_rate", np.arange(0.01, 0.05, 0.01)),
        "boost_from_average": hp.choice("boost_from_average", [True, False]),
        "is_unbalance": hp.choice("is_unbalance", [True, False]),
        "zero_as_missing": hp.choice("zero_as_missing", [True, False]),
        "max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6, 7]),
        "num_leaves": hp.choice("num_leaves", [11, 31, 51, 101, 151, 201]),
        "feature_fraction": hp.choice("feature_fraction", np.arange(0.5, 1.0, 0.1)),
        "bagging_fraction": hp.choice("bagging_fraction", np.arange(0.5, 1.0, 0.1)),
        "bagging_freq": hp.choice("bagging_freq", [1, 3, 5, 10, 20, 50]),
        "reg_alpha": hp.uniform("reg_alpha", 0, 10),
        "reg_lambda": hp.uniform("reg_lambda", 0, 10),
        "min_child_weight": hp.uniform("min_child_weight", 0, 10),
    }

    config.limit_time_fraction(0.15)
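    # At most 15% of the remaining time budget goes to the search; once the limit is hit, objective() just returns a worst-case loss.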

    def objective(hyperparams):
        if config.is_time_fraction_limit():
            score = np.inf if config.is_regression() else 0
            return {'loss': score, 'status': STATUS_OK}

        model = lgb.train({**params, **hyperparams}, train_data, 300, valid_data,
                          early_stopping_rounds=100, verbose_eval=False)

        score = model.best_score["valid_0"][params["metric"]]
        Log.print(score)
        if config.is_classification():
            score = -score

        return {'loss': score, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=space, trials=trials, algo=tpe.suggest, max_evals=100, verbose=1,
                         rstate=np.random.RandomState(1))

    hyperparams = space_eval(space, best)
    Log.print("{:0.4f} {}".format(trials.best_trial['result']['loss'], hyperparams))
    return hyperparams
Example #11
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    if "dtype" not in config:
        preview_df(csv_path, config)

    df = pandas_read_csv(csv_path, config)
    if config.is_train():
        config["nrows"] = len(df)
        config["target_data"] = df['target'].copy()

    return df
Example #12
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    if "dtype" not in config:
        preview_df(csv_path, config)

    df = pandas_read_csv(csv_path, config)
    if config.is_train():
        config["nrows_stage_nb"] = 0
        config["nrows"] = len(df)

    return df
Example #13
def main(parse):
    # load configs
    dconf_path = 'config/data.json'
    mconf_path = 'config/word2vec.json'
    dconf = Config(dconf_path)
    mconf = Config(mconf_path)

    # load w2v model and train
    if mconf.model == 'cbow':
        w2v = CbowModel(dconf, mconf)
    else:
        w2v = SkipGramModel(dconf, mconf)

    w2v.load('trained.pth')

    # test w2v
    word = 'hospital'
    print(w2v.nearest(word))

    print(w2v.similarity(word, 'attacks').item())
    print(w2v.similarity(word, word).item())
Example #14
def feature_selection(df: pd.DataFrame, config: Config):
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb < 2 * 1024:
            return

        selected_columns = []
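        # Select features on 5% samples and stop once the columns kept shrink the dataset under 2 GB.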
        for i in range(3):

            config_sample = copy.deepcopy(config)

            df_sample = df.sample(frac=0.05, random_state=i).copy(deep=True)
            df_sample = preprocess_pipeline(df_sample, config_sample)
            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            if len(X.columns) > 0:
                selected_columns += select_features(X, y, config["mode"])
            else:
                break

            df_size_mb = df.drop(
                list(set(df) - set(selected_columns)), axis=1,
                errors='ignore').memory_usage(deep=True).sum() / 1024 / 1024
            if df_size_mb < 2 * 1024:
                break

        selected_columns = list(set(selected_columns))

        log("Selected columns: {}".format(selected_columns))

        drop_number_columns = [
            c for c in df if (c.startswith("number_") or c.startswith("id_"))
            and c not in selected_columns
        ]
        if len(drop_number_columns) > 0:
            config["drop_number_columns"] = drop_number_columns

        drop_datetime_columns = [
            c for c in df
            if c.startswith("datetime_") and c not in selected_columns
        ]
        if len(drop_datetime_columns) > 0:
            config["drop_datetime_columns"] = drop_datetime_columns

    if "drop_number_columns" in config:
        log("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        log("Drop datetime columns: {}".format(
            config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)
Example #15
File: main.py  Project: tree-park/word2vec
def main(args):
    # load configs
    dconf_path = 'config/data.json'
    mconf_path = 'config/word2vec.json'
    dconf = Config(dconf_path)
    mconf = Config(mconf_path)
    # load w2v model and train
    if mconf.model == 'cbow':
        w2v = CbowModel(dconf, mconf, args.mode)
    else:
        w2v = SkipGramModel(dconf, mconf, args.mode)

    if args.mode != 'test':
        w2v.train()
        w2v.save(dconf.saved_file)

    # test w2v
    word = 'hospital'
    print(w2v.nearest(word))
    print(w2v.similarity(word, 'attacks').item())
    print(w2v.similarity(word, word).item())
Example #16
def fillna(df: pd.DataFrame, config: Config, args: dict = {}):

    if len(args) != 0:

        for k, v in args.items():

            if config.is_train():
                lst_columns = [c for c in df if c.startswith(k)]
                config['stages'][config["stage"]][k] = {
                    'lst_columns': lst_columns
                }

                if len(lst_columns) != 0:
                    if 'agg' in v or 'value' in v:

                        if config.is_train():
                            s_fillna_values = calc_columns_metric(
                                df,
                                lst_columns,
                                metric=v['agg'] if 'agg' in v else None,
                                value=v['value'] if 'value' in v else None)

                            config['stages'][config["stage"]][k][
                                'fillna_values'] = deepcopy(s_fillna_values)

            if len(config['stages'][config["stage"]][k]['lst_columns']) != 0:
                fillna_columns(
                    df, config['stages'][config["stage"]][k]['fillna_values'])

    else:

        for c in [c for c in df if c.startswith("number_")]:
            df[c].fillna(-1, inplace=True)

        for c in [c for c in df if c.startswith("string_")]:
            df[c].fillna("", inplace=True)

        for c in [c for c in df if c.startswith("datetime_")]:
            df[c].fillna(datetime.datetime(1970, 1, 1), inplace=True)
Example #17
def subsample(df: pd.DataFrame, config: Config, max_size_mb: float=2.0):
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb > max_size_mb:
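            # Estimate how many rows fit into max_size_mb and drop the rest of the dataset.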
            mem_per_row = df_size_mb / len(df)
            sample_rows = int(max_size_mb / mem_per_row)

            log("Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. Subsample to {} rows.".format(df_size_mb, len(df), sample_rows))
            _, df_drop = train_test_split(df, train_size=sample_rows, random_state=1)
            df.drop(df_drop.index, inplace=True)

            config["nrows"] = sample_rows
        else:
            config["nrows"] = len(df)
Example #18
def check_columns_exists(df: pd.DataFrame,
                         config: Config,
                         key_stage: str,
                         drop_columns_test: bool = True):
    field_target_name = config['params']['field_target_name']
    if config.is_train():
        if 'columns_exists' not in config['params']['pipeline'][
                config["stage"]]:
            config['params']['pipeline'][
                config["stage"]]['columns_exists'] = {}
            if field_target_name not in df.columns:
                raise ValueError(
                    'Column y="{0}" does not exist in train dataset'.format(
                        field_target_name))

        config['params']['pipeline'][config["stage"]]['columns_exists'][key_stage] = \
                                    set([x for x in df.columns if x!=field_target_name])

    elif 'columns_exists' in config['params']['pipeline'][config["stage"]]:
        if key_stage in config['params']['pipeline'][
                config["stage"]]['columns_exists']:
            set_columns = config['params']['pipeline'][
                config["stage"]]['columns_exists'][key_stage] - set(df.columns)
            if len(set_columns) != 0:
                raise ValueError(
                    'Columns "{0}" do not exist in test dataset on stage {1}'.
                    format(str(set_columns), key_stage))

            set_columns = set(df.columns) - config['params']['pipeline'][
                config["stage"]]['columns_exists'][key_stage]
            if len(set_columns) != 0:
                if drop_columns_test:
                    df.drop(columns=[x for x in set_columns], inplace=True)
                else:
                    raise ValueError(
                        'Columns "{0}" do not exist in train dataset on stage {1}'
                        .format(str(set_columns), key_stage))
        else:
            raise ValueError(
                'Preprocess stage "{0}" does not exist'.format(key_stage))
Example #19
def scale(df: pd.DataFrame, config: Config):
    warnings.filterwarnings(action='ignore', category=DataConversionWarning)
    scale_columns = [
        c for c in df if c.startswith("number_") and df[c].dtype != np.int8
        and c not in config["categorical_columns"]
    ]

    if len(scale_columns) > 0:

        if config.is_train():
            config['stages'][config["stage"]]['scale_columns'] = deepcopy(
                scale_columns)
            config['stages'][config["stage"]]['model'] = StandardScaler(
                copy=False)
            config['stages'][config["stage"]]['model'].fit(
                df[scale_columns].astype(np.float32))

        scale_cols = config['stages'][config["stage"]]['scale_columns']
        df[scale_cols] = config['stages'][config["stage"]]['model'].transform(
            df[scale_cols].astype(np.float32)).astype(np.float32)
Example #20
def subsample(df: pd.DataFrame, config: Config):
    if config.is_train():
        # df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        df_size_mb, sample_rows = get_sample_rows(df, config)

        if df_size_mb > config['params']['memory']['max_size_mb']:
            # mem_per_row = df_size_mb / len(df)
            # sample_rows = int(config['params']['memory']['max_size_mb'] / mem_per_row)

            log("Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. Subsample to {} rows." \
                    .format(df_size_mb, len(df), sample_rows), config.verbose)
            _, df_drop = train_test_split(df,
                                          train_size=sample_rows,
                                          random_state=1)
            df.drop(df_drop.index, inplace=True)

            config["nrows"] = sample_rows
        elif config["nrows_stage_nb"] == 0:
            config["nrows"] = max(sample_rows, len(df))
        else:
            config["nrows"] = min(sample_rows, config["nrows"])

        config["nrows_stage_nb"] = config["stage_nb"]
Example #21
import os
import sys

import torch
import pickle

from lib.util import Config
from lib.kor2eng import LangTranslator
from lib.util import load_data
from lib.data_preprocess import Vocab, preprocessor
from lib.model.seq2seq import BiLSTMSeq2Seq
from transformers.lib.model.transformer import Transformer

file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)

# load configs
dconf_path = 'config/data.json'
mconf_path = 'config/lm.json'
dconf = Config(dconf_path)
mconf = Config(mconf_path)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")
print('Using device:', device)

# try:
#     with open('preprocessed_data.pickle', 'rb') as f:
#         saved_obj = pickle.load(f)
#         ko_corpus, ko_vocab, en_corpus, en_vocab = saved_obj
# except:
#
#     # load & preprocess corpus
#     ko_corpus = preprocessor(load_data(dconf.train_ko_path), lang='ko')
#     en_corpus = preprocessor(load_data(dconf.train_en_path), lang='en')
Example #22
def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config):
    params = {
        "objective":
        "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "verbosity": -1,
        "seed": 1,
    }

    X_sample, y_sample = data_sample(X, y)
    hyperparams = hyperopt_lightgbm(X_sample, y_sample, params, config)

    for i in range(1):
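        # A single bagging round: run 5-fold CV and append each fold's model to config["model"].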
        print(
            '################################################################## cv '
            + str(i))
        t1_bagging = time.time()
        params['seed'] = i + 1
        # cv
        nfold = 5
        if config["mode"] == 'classification':
            skf = StratifiedKFold(n_splits=nfold,
                                  shuffle=True,
                                  random_state=777)
        else:
            skf = KFold(n_splits=nfold, shuffle=True, random_state=777)
        skf_split = skf.split(X, y)

        log('####################################################################### begin cv'
            )
        log('####### cur time = ' +
            str(datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
        score_list = []
        config["model"] = []
        for fid, (train_idx, valid_idx) in enumerate(skf_split):
            t1_cv = time.time()
            print("FoldID:{}".format(fid))
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
            dtrain = lgb.Dataset(X_train, label=y_train)
            dvalid = lgb.Dataset(X_valid, label=y_valid, reference=dtrain)

            cur_model = lgb.train({**params, **hyperparams},
                                  dtrain,
                                  3000,
                                  dvalid,
                                  early_stopping_rounds=50,
                                  verbose_eval=100)
            config["model"].append(cur_model)

            score_list.append(cur_model.best_score)
            # gc.collect()
            sys.stdout.flush()
            t2_cv = time.time()
            time_left = config.time_left()
            print('######### cv' + str(time_left))
            if (t2_cv - t1_cv) * (nfold - fid + 1) >= time_left:
                pass
                #break

        log('######################################################################### end cv'
            )
        log('####### cur time = ' +
            str(datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")))

        valid_auc = np.array(
            [i['valid_0'][params['metric']] for i in score_list])
        print('valid', valid_auc, np.mean(valid_auc))
        cv_score = pd.DataFrame(
            {'cv': np.hstack([valid_auc, np.mean(valid_auc)])})
        path = config['path_pred']
        print(path)
        cv_score.to_csv(path + '/cv_score_' + str(i) + '.csv', index=False)

        t2_bagging = time.time()
        time_left = config.time_left()
        print('#########bagging' + str(time_left))
        if (t2_bagging - t1_bagging) * 1.5 >= time_left:
            #break
            pass
Example #23
def __init__(self, model_dir: str):
    os.makedirs(model_dir, exist_ok=True)
    self.config = Config(model_dir)
Example #24
def time_series_detect(df: pd.DataFrame, config: Config):
    sample_size = 10000
    model_params = {
        "objective": "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "learning_rate": 0.01,
        "verbosity": -1,
        "seed": 1,
        "max_depth": -1,
    }

    if config.is_train():
        datetime_columns = [c for c in df if c.startswith("datetime_")]
        id_columns = [c for c in df if c.startswith("id_")]

        sort_columns = []
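        # Candidate sort orders: each datetime column alone, each (id, datetime) pair, and each id column alone.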
        for dc in datetime_columns:
            sort_columns.append([dc])
            for ic in id_columns:
                sort_columns.append([ic, dc])
        for ic in id_columns:
            sort_columns.append([ic])

        scores = []
        config.limit_time_fraction(0.1)
        for sc in sort_columns:
            if config.is_time_fraction_limit():
                break

            Log.silent(True)
            df.sort_values(sc, inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            df_sample = df_sample[[c for c in df_sample if c.startswith("number_") or c == "target" or c in sc]]
            shift_columns(df_sample, group= sc[0] if len(sc) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)
            X_train, X_test, y_train, y_test = ts_split(X, y, test_size=0.5)

            model_sorted = lgb.train(model_params, lgb.Dataset(X_train, label=y_train), 3000, lgb.Dataset(X_test, label=y_test),
                              early_stopping_rounds=100, verbose_eval=False)
            score_sorted = model_sorted.best_score["valid_0"][model_params["metric"]]

            sampled_columns = [c for c in X if "_shift" not in c]
            model_sampled = lgb.train(model_params, lgb.Dataset(X_train[sampled_columns], label=y_train), 3000, lgb.Dataset(X_test[sampled_columns], label=y_test),
                              early_stopping_rounds=100, verbose_eval=False)
            score_sampled = model_sampled.best_score["valid_0"][model_params["metric"]]

            if config.is_classification():
                score_sorted = -score_sorted
                score_sampled = -score_sampled

            Log.silent(False)
            Log.print("Sort: {}. Score sorted: {:0.4f}. Score sampled: {:0.4f}".format(sc, score_sorted, score_sampled))
            score_ratio = score_sampled / score_sorted if config.is_regression() else abs(score_sorted / score_sampled)
            if score_ratio >= 1.03:
                Log.print(score_ratio)
                scores.append((score_sorted, sc))

        if len(scores) > 0:
            scores = sorted(scores, key=lambda x: x[0])
            Log.print("Scores: {}".format(scores))
            config["sort_values"] = scores[0][1]
            df.sort_values(config["sort_values"], inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            shift_columns(df_sample, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            model = lgb.train(model_params, lgb.Dataset(X, label=y), 1000)
            fi = pd.Series(model.feature_importance(importance_type="gain"), index=X.columns)
            fi = fi[fi > 0].sort_values()
            selected_columns = fi[fi >= fi.quantile(0.75)].index.tolist()

            selected_shift_columns = [c.replace("_shift", "") for c in selected_columns if "_shift" in c]
            if len(selected_shift_columns) > 0:
                Log.print("Shift columns: {}".format(selected_shift_columns))
                config["shift_columns"] = selected_shift_columns

    if "shift_columns" in config:
        shift_columns(df, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None, number_columns=config["shift_columns"])
Example #25
import torch
import pickle

from lib.util import Config
from lib.kor2eng import LangTranslator
from lib.util import load_data
from lib.data_preprocess import Vocab, preprocessor
from lib.model.seq2seq import BiLSTMSeq2Seq
from transformers.lib.model.transformer import Transformer

# load configs
dconf_path = 'config/data.json'
mconf_path = 'config/lm.json'
dconf = Config(dconf_path)
mconf = Config(mconf_path)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")
print('Using device:', device)

with open('preprocessed_data.pickle', 'rb') as f:
    saved_obj = pickle.load(f)
    _, ko_vocab, _, en_vocab = saved_obj

# define lm model
if mconf.model == 'seq2seq':
    model = BiLSTMSeq2Seq(
        len(ko_vocab) + 1,
        len(en_vocab) + 1, mconf.emb_dim, mconf.d_m)
elif mconf.model == 'transformers':
    model = Transformer(mconf.d_m,
Example #26
def train_lightgbm(X: pd.DataFrame, y: pd.Series, stored_models_key: str,
                   save_to_disk: bool, config: Config):

    config[stored_models_key] = []

    data = lgb.Dataset(X, label=y, free_raw_data=False)
    data.construct()
    gc.collect()

    params = {
        "objective": config["objective"],
        "metric": config["metric"],
        "seed": config["seed"],
        'num_threads': config['n_threads'],
        "verbosity": -1,
    }

    seed = config["seed"]

    space = {
        "learning_rate":
        hp.uniform("learning_rate", 0.01, 0.4),
        "max_depth":
        hp.choice("max_depth", [-1, 2, 3, 4, 5, 6, 10]),
        "num_leaves":
        hp.choice("num_leaves", np.linspace(4, 200, 50, dtype=int)),
        "feature_fraction":
        hp.quniform("feature_fraction", 0.1, 1., 0.1),
        "bagging_fraction":
        hp.quniform("bagging_fraction", 0.1, 1., 0.1),
        "bagging_freq":
        hp.choice("bagging_freq", np.linspace(0, 20, 10, dtype=int)),
        "reg_alpha":
        hp.uniform("reg_alpha", 0, 30),
        "reg_lambda":
        hp.uniform("reg_lambda", 0, 30),
        "min_child_weight":
        hp.uniform('min_child_weight', 1e-10, 20),
        "max_bin":
        hp.choice('max_bin', [50, 100, 255]),
        'boosting_type':
        hp.choice(
            'boosting_type',
            [
                {
                    'boosting_type': 'gbdt',
                },
                {
                    'boosting_type':
                    'dart',
                    'drop_rate':
                    hp.uniform('drop_rate', 0.01, 0.6),
                    'max_drop':
                    hp.choice(
                        "max_drop",
                        np.linspace(5,
                                    config["train_num_boost_round"] * .9,
                                    10,
                                    dtype=int)),
                    'skip_drop':
                    hp.uniform('skip_drop', 0.1, 0.7),
                },
                #  {'boosting_type': 'rf',
                #   'bagging_freq': 1,
                #   },
                #  {'boosting_type': 'goss',
                #   'bagging_freq': 0,
                #   },
            ]),
        #train params
        'early_stopping_rounds':
        hp.choice("early_stopping_rounds", [None, 50]),
        'cv_splits':
        hp.choice("cv_splits", np.linspace(3, 12, 10, dtype=int)),  # [4,8]
        'shuffle':
        hp.choice("shuffle", [True, False]),
    }

    if config.is_classification():
        space['scale_pos_weight'] = hp.uniform('scale_pos_weight', 0.5, 10)
    else:
        space['objective'] = hp.choice(
            "objective",
            [
                'regression',
                'huber',
                # 'fair',
                # 'regression_l1',
            ])

    def objective(space_sample):

        iteration_start = time.time()
        hyperparams = copy.deepcopy(space_sample)
        boosting_type = {}
        if 'boosting_type' in hyperparams.keys():
            boosting_type = hyperparams.pop('boosting_type')

        hyperparams = {**params, **hyperparams, **boosting_type}

        scores, models, y_oof = train_lightgbm_cv(data=data,
                                                  hyperparams=hyperparams,
                                                  config=config)

        if config.is_classification(): scores['oof'] = -scores['oof']

        iteration_time = time.time() - iteration_start
        log('iteration time %.1f, loss %.5f' % (iteration_time, scores['oof']))

        elapsed_time = (time.time() - config['start_time'])
        have_time = (config["time_limit"] - elapsed_time - iteration_time) > 25
        if have_time:
            save_model(models, hyperparams, scores, y_oof, stored_models_key,
                       save_to_disk, config)

            status = STATUS_OK
        else:
            status = STATUS_FAIL

        return {
            'loss': scores['oof'],
            'runtime': iteration_time,
            'scores': scores,
            'models': models,
            'y_oof': y_oof,
            'status': status
        }

    have_time = True
    eval_n = 0
    trials = Trials()

    while have_time:
        iteration_start = time.time()
        best = hyperopt.fmin(
            fn=objective,
            space=space,
            trials=trials,
            algo=tpe.suggest,
            max_evals=eval_n + 1,
            verbose=1,
            rstate=np.random.RandomState(eval_n)
        )  #TODO: (bug) if seed the same - in some cases it samples same values forever
        iteration_time = time.time() - iteration_start
        elapsed_time = (time.time() - config['start_time'])
        have_time = (config["time_limit"] - elapsed_time - iteration_time) > 25
        eval_n += 1
Example #27
class AutoML:
    def __init__(self, model_dir: str):
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config["model"] = {}
        self.config["ensemble"] = {"lgb": 1}

        self.config.tmp_dir = self.config.model_dir + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        # load holiday
        path_holiday = './holiday.csv'
        holiday = pd.read_csv(path_holiday,
                              encoding='utf-8',
                              low_memory=False,
                              dtype={'holiday': str})['holiday'].values
        self.config['holiday'] = set(holiday)

        df = read_df(train_csv, self.config)
        print(df.shape)

        holiday_detect(df, self.config)

        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)

        train(X, y, self.config)

    def predict(self, test_csv: str,
                prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        result = {"line_id": [], "prediction": []}
        if 'holiday_detect' in self.config:
            result["datetime"] = []

        for X in pd.read_csv(test_csv,
                             encoding="utf-8",
                             low_memory=False,
                             dtype=self.config["dtype"],
                             parse_dates=self.config["parse_dates"],
                             chunksize=self.config["nrows"]):
            result["line_id"] += list(X["line_id"])
            if 'holiday_detect' in self.config:
                dt_fea = self.config['holiday_detect']
                result["datetime"] += list(X[dt_fea])

            preprocess(X, self.config)
            result["prediction"] += list(predict(X, self.config))

        result = pd.DataFrame(result)

        # post process for holiday
        if 'holiday_detect' in self.config:
            holiday = self.config['holiday']
            for idx, row in result.iterrows():
                dt = row['datetime']
                dt_str = str(dt).split(' ')[0].strip()
                if dt_str in holiday or dt.weekday() == 5 or dt.weekday() == 6:
                    result.loc[idx, 'prediction'] = 0

            result.drop(["datetime"], axis=1, inplace=True)

        result.to_csv(prediction_csv, index=False)

        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        self.config.save()

    @timeit
    def load(self):
        self.config.load()
Example #28
def non_negative_target_detect(df: pd.DataFrame, config: Config):
    if config.is_train():
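        # True when the training target contains no negative values.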
        config["non_negative_target"] = df["target"].lt(0).sum() == 0
Example #29
class AutoML:
    def __init__(self, model_dir: str):
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode

        self.config["objective"] = "regression" if mode == "regression" else "binary"
        self.config["metric"] = "rmse" if mode == "regression" else "auc"

        self.config.tmp_dir = self.config.model_dir + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        df = preprocess(df, self.config)

        y = df["target"].copy()
        X = df.drop("target", axis=1).copy()
        del df
        gc.collect()

        self.config["columns"] = list(X)

        train(X, y, self.config)

    def predict(self, test_csv: str,
                prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        self.config["prediction_csv"] = prediction_csv
        self.config["line_id"] = []

        self.config["start_time"] = time.time()

        result = {
            "line_id": [],
            "prediction": [],
        }

        X = pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
        )
        self.config["line_id"] = X["line_id"].values

        result["line_id"] = (X["line_id"].values)
        X = preprocess(X, self.config)

        X = X[self.config["columns"]]  # for right columns order

        result["prediction"] = predict(X, self.config)

        result = pd.DataFrame(result)
        result.to_csv(prediction_csv, index=False)

        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"],
                             self.config)
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        self.config.save()

    @timeit
    def load(self):
        self.config.load()
Example #30
class AutoML:
    def __init__(self, model_dir: str, params: dict, verbose: int=0):
        self.config = Config(model_dir, params, verbose=verbose)
        self.verbose=verbose

        if 'memory' not in self.config['params']:
            self.config['params']['memory'] = {}
        if 'max_size_mb' not in self.config['params']['memory']:
            self.config['params']['memory']['max_size_mb'] = 2
        if 'max_size_train_samples' not in self.config['params']['memory']:
            self.config['params']['memory']['max_size_train_samples'] = 10000
        if 'field_target_name' not in self.config['params']:
            self.config['params']['field_target_name'] = 'target'

    @timeit
    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config['stages_time'] = {}
        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        start_time = time.time()
        df = read_df(train_csv, self.config)
        stage_time_inc(self.config, start_time, 'train read_df')
        
        pipeline(df, self.config)
        
        if self.config.verbose:
            self.stages_time_print()
    
    @timeit
    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):

        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
        self.config['stages_time'] = {}
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        result = {
            "line_id": [],
            "prediction": [],
        }

        start_time = time.time()
        
        for X in pd.read_csv(
                test_csv,
                encoding="utf-8",
                low_memory=False,
                dtype=self.config["dtype"],
                parse_dates=self.config["parse_dates"],
                chunksize=self.config["nrows"]
           ):
            stage_time_inc(self.config, start_time, 'test pd.read_csv')
            result["line_id"] += list(X["line_id"])
            pipeline(X, self.config)
            result["prediction"] += list(X[self.config['graph'][-1][0]])
            start_time = time.time()

        result = pd.DataFrame(result)
        result.to_csv(prediction_csv, index=False)
        stage_time_inc(self.config, start_time, 'result.to_csv')
        
        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            start_time = time.time()
            score = validate(result, target_csv, self.config["mode"], self.config.verbose)
            stage_time_inc(self.config, start_time, 'validate')
        else:
            score = None

        if self.config.verbose:
            self.stages_time_print()

        return result, score

    def stages_time_print(self, sort_by_time=True):
        if 'stages_time' in self.config.data.keys():
            d = self.config['stages_time']
            print('\n','-'*3, 'Pipeline stages time, sec:','-'*3)
            l_just = max([len(x) for x in d.keys()]) + 4
            if sort_by_time:
                for k, v in [(k, d[k]) for k in sorted(d, key=d.get, reverse=True)]:
                    print(k.replace('\n', '_').ljust(l_just), '{:<10} {:.2f}'.format(' ', v))
            else:
                for k, v in self.config['stages_time'].items():
                    print(k.replace('\n', '_').ljust(l_just), '{:<10} {:.2f}'.format(' ', v))
            print('-'*34, '\n')


    def pipeline_draw(self, file_name='AutoML_pipeline.gv', view=False):
        g = Digraph('G', filename=file_name)
        for i in self.config['graph']:
            g.edge(i[0], i[1])
    
        if view:
            g.view()
    
        return g

    @timeit
    def save(self):
        self.config.save()

    @timeit
    def load(self):
        self.config.load()
        self.config.verbose = self.verbose