Example #1
def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config):
    params = {
        "objective": "regression" if config.is_regression() else "binary",
        "metric": "rmse" if config.is_regression() else "auc",
        "verbosity": -1,
        "seed": 1,
    }

    X_sample, y_sample = data_sample(X, y, config, nrows=20000)
    hyperparams = hyperopt_lightgbm(X_sample, y_sample, params, config)

    X_train, X_val, y_train, y_val = data_split(X, y, config)

    config["model"] = lgb.train(
        {**params, **hyperparams},
        lgb.Dataset(X_train, label=y_train),
        5000,
        lgb.Dataset(X_val, label=y_val),
        early_stopping_rounds=100,
        verbose_eval=100,
    )
    config.save()

    # second pass: retrain on the full data for ~1.2x the best iteration count,
    # staying within the remaining time budget
    try:
        with time_limit(config.time_left() - 10):
            config["model"] = lgb.train(
                {**params, **hyperparams},
                lgb.Dataset(X, label=y),
                num_boost_round=int(1.2 * config["model"].best_iteration),
            )
    except TimeoutException:
        Log.print("Timed out!")
Example #2
class AutoML:
    def __init__(self, model_dir: str):
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config.tmp_dir = self.config.model_dir + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)

        train(X, y, self.config)

    def predict(self, test_csv: str,
                prediction_csv: str) -> Tuple[pd.DataFrame, Optional[np.float64]]:
        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        result = {
            "line_id": [],
            "prediction": [],
        }

        for X in pd.read_csv(test_csv,
                             encoding="utf-8",
                             low_memory=False,
                             dtype=self.config["dtype"],
                             parse_dates=self.config["parse_dates"],
                             chunksize=self.config["nrows"]):
            result["line_id"] += list(X["line_id"])
            preprocess(X, self.config)
            result["prediction"] += list(predict(X, self.config))

        result = pd.DataFrame(result)
        result.to_csv(prediction_csv, index=False)

        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
            logf('SCORE:', score)
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        self.config.save()

    @timeit
    def load(self):
        self.config.load()
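
The validate helper called in these examples is not shown; a plausible sketch, assuming the target file has line_id and target columns and that mode is either "regression" or a binary-classification mode, is:

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, roc_auc_score

def validate(result: pd.DataFrame, target_csv: str, mode: str) -> np.float64:
    # align predictions with the ground truth by line_id
    target = pd.read_csv(target_csv, encoding="utf-8")
    merged = result.merge(target, on="line_id", how="inner")

    if mode == "regression":
        # RMSE for regression tasks
        return np.sqrt(mean_squared_error(merged["target"], merged["prediction"]))
    # ROC AUC for binary classification tasks
    return roc_auc_score(merged["target"], merged["prediction"])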
Example #3
class AutoML:
    def __init__(self, model_dir: str):
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config.tmp_dir = self.config.model_dir + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)
        train(X, y, self.config)

    def predict(self, test_csv: str, prediction_csv: str) -> Tuple[pd.DataFrame, Optional[np.float64]]:
        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(test_csv, self.config)
        result = {
            "line_id": list(df["line_id"]),
            "prediction": [],
        }

        def chunker(seq, size):
            return (seq[pos:pos+size] for pos in range(0, len(seq), size))

        for chunk in chunker(df, 100000):
            X = chunk.copy()
            preprocess(X, self.config)
            result["prediction"] += list(predict(X, self.config))

        result = pd.DataFrame(result)
        result.sort_values("line_id", inplace=True)
        result.to_csv(prediction_csv, index=False)

        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score

    def save(self):
        self.config.save()

    def load(self):
        self.config.load()
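
The inline chunker generator above slices a DataFrame by row position; a quick standalone check of what it yields (a hypothetical demo, not part of the example):

import pandas as pd

def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

df = pd.DataFrame({"line_id": range(250)})
sizes = [len(chunk) for chunk in chunker(df, 100)]
print(sizes)  # [100, 100, 50]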
Example #4
# # save data as pickle
# with open('preprocessed_data.pickle', 'wb') as f:
#     pickle.dump([ko_corpus, ko_vocab, en_corpus, en_vocab], f, protocol=pickle.HIGHEST_PROTOCOL)
#
# assert all([ko_corpus, ko_vocab, en_corpus, en_vocab])

# define the translation model (Transformer or BiLSTM seq2seq)
if mconf.model == 'transformers':
    model = Transformer(mconf.d_m,
                        len(ko_vocab) + 1,
                        len(en_vocab) + 1,
                        mconf.d_m * 8,
                        n=6)
else:
    model = BiLSTMSeq2Seq(
        len(ko_vocab) + 1,
        len(en_vocab) + 1, mconf.emb_dim, mconf.d_m)
model.to(device)

# wrap the model in a translator and train it on the parallel corpora
lm = LangTranslator(model, ko_vocab, en_vocab, dconf, mconf, device)
lm.train(ko_corpus, en_corpus)

# save model
lm.save('trained_keep.pth')
mconf.save(mconf_path)

test = ['또 하나 필요한 것은 훌륭한 영어 실력이다.', '경찰은 월요일 밤 집무실을 찾아 증거를 압수했다.']
print(lm.translate(test))
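
The pickle dump commented out at the top of this example has a natural counterpart for reloading the preprocessed data; a minimal sketch, assuming the same file name and the same object order as the dump:

import pickle

# reload the corpora and vocabularies saved by the commented-out dump above
with open('preprocessed_data.pickle', 'rb') as f:
    ko_corpus, ko_vocab, en_corpus, en_vocab = pickle.load(f)

assert all([ko_corpus, ko_vocab, en_corpus, en_vocab])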
Example #5
class AutoML:
    def __init__(self, model_dir: str):
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config["model"] = {}
        self.config["ensemble"] = {"lgb": 1}

        self.config.tmp_dir = self.config.model_dir + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        # load holiday
        path_holiday = './holiday.csv'
        holiday = pd.read_csv(
            path_holiday, encoding='utf-8', low_memory=False,
            dtype={'holiday': str},
        )['holiday'].values
        self.config['holiday'] = set(holiday)

        df = read_df(train_csv, self.config)
        print(df.shape)

        holiday_detect(df, self.config)

        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)

        train(X, y, self.config)

    def predict(self, test_csv: str,
                prediction_csv: str) -> Tuple[pd.DataFrame, Optional[np.float64]]:
        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        result = {"line_id": [], "prediction": []}
        if 'holiday_detect' in self.config:
            result["datetime"] = []

        for X in pd.read_csv(test_csv,
                             encoding="utf-8",
                             low_memory=False,
                             dtype=self.config["dtype"],
                             parse_dates=self.config["parse_dates"],
                             chunksize=self.config["nrows"]):
            result["line_id"] += list(X["line_id"])
            if 'holiday_detect' in self.config:
                dt_fea = self.config['holiday_detect']
                result["datetime"] += list(X[dt_fea])

            preprocess(X, self.config)
            result["prediction"] += list(predict(X, self.config))

        result = pd.DataFrame(result)

        # post process for holiday
        if 'holiday_detect' in self.config:
            holiday = self.config['holiday']
            for idx, row in result.iterrows():
                dt = row['datetime']
                dt_str = str(dt).split(' ')[0].strip()
                if dt_str in holiday or dt.weekday() == 5 or dt.weekday() == 6:
                    result.loc[idx, 'prediction'] = 0

            result.drop(["datetime"], axis=1, inplace=True)

        result.to_csv(prediction_csv, index=False)

        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        self.config.save()

    @timeit
    def load(self):
        self.config.load()
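
The row-by-row iterrows loop in the holiday post-processing above can also be written as a vectorized mask, which is usually much faster on large prediction files; a sketch, assuming the same datetime/prediction columns and that the holiday set holds YYYY-MM-DD strings (the format produced by str(Timestamp) above):

import pandas as pd

def zero_out_holidays(result: pd.DataFrame, holiday: set) -> pd.DataFrame:
    # hypothetical vectorized equivalent of the iterrows post-processing loop
    dt = pd.to_datetime(result["datetime"])
    is_holiday = dt.dt.strftime("%Y-%m-%d").isin(holiday)
    is_weekend = dt.dt.weekday >= 5  # 5 = Saturday, 6 = Sunday
    result.loc[is_holiday | is_weekend, "prediction"] = 0
    return result.drop(columns=["datetime"])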
Example #6
class AutoML:
    def __init__(self, model_dir: str):
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode

        self.config["objective"] = "regression" if mode == "regression" else "binary"
        self.config["metric"] = "rmse" if mode == "regression" else "auc"

        self.config.tmp_dir = self.config.model_dir + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        df = preprocess(df, self.config)

        y = df["target"].copy()
        X = df.drop("target", axis=1).copy()
        del df
        gc.collect()

        self.config["columns"] = list(X)

        train(X, y, self.config)

    def predict(self, test_csv: str,
                prediction_csv: str) -> Tuple[pd.DataFrame, Optional[np.float64]]:
        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        self.config["prediction_csv"] = prediction_csv
        self.config["line_id"] = []

        self.config["start_time"] = time.time()

        result = {
            "line_id": [],
            "prediction": [],
        }

        X = pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
        )
        self.config["line_id"] = X["line_id"].values

        result["line_id"] = (X["line_id"].values)
        X = preprocess(X, self.config)

        X = X[self.config["columns"]]  # for right columns order

        result["prediction"] = predict(X, self.config)

        result = pd.DataFrame(result)
        result.to_csv(prediction_csv, index=False)

        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"],
                             self.config)
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        self.config.save()

    @timeit
    def load(self):
        self.config.load()
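
The timeit decorator applied to save and load here (and to train and predict in the next example) is another helper these snippets assume; a minimal sketch of such a decorator:

import time
from functools import wraps

def timeit(method):
    # log how long the wrapped method took, preserving its name and docstring
    @wraps(method)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = method(*args, **kwargs)
        print("{} took {:.2f} s".format(method.__name__, time.time() - start))
        return result
    return wrapper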
Example #7
class AutoML:
    def __init__(self, model_dir: str, params: dict, verbose: int = 0):
        self.config = Config(model_dir, params, verbose=verbose)
        self.verbose = verbose

        if 'memory' not in self.config['params']:
            self.config['params']['memory'] = {}
        if 'max_size_mb' not in self.config['params']['memory']:
            self.config['params']['memory']['max_size_mb'] = 2
        if 'max_size_train_samples' not in self.config['params']['memory']:
            self.config['params']['memory']['max_size_train_samples'] = 10000
        if 'field_target_name' not in self.config['params']:
            self.config['params']['field_target_name'] = 'target'

    @timeit
    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config['stages_time'] = {}
        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        start_time = time.time()
        df = read_df(train_csv, self.config)
        stage_time_inc(self.config, start_time, 'train read_df')
        
        pipeline(df, self.config)
        
        if self.config.verbose:
            self.stages_time_print()
    
    @timeit
    def predict(self, test_csv: str, prediction_csv: str) -> Tuple[pd.DataFrame, Optional[np.float64]]:

        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
        self.config['stages_time'] = {}
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        result = {
            "line_id": [],
            "prediction": [],
        }

        start_time = time.time()
        
        for X in pd.read_csv(
                test_csv,
                encoding="utf-8",
                low_memory=False,
                dtype=self.config["dtype"],
                parse_dates=self.config["parse_dates"],
                chunksize=self.config["nrows"]
           ):
            stage_time_inc(self.config, start_time, 'test pd.read_csv')
            result["line_id"] += list(X["line_id"])
            pipeline(X, self.config)
            result["prediction"] += list(X[self.config['graph'][-1][0]])
            start_time = time.time()

        result = pd.DataFrame(result)
        result.to_csv(prediction_csv, index=False)
        stage_time_inc(self.config, start_time, 'result.to_csv')
        
        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            start_time = time.time()
            score = validate(result, target_csv, self.config["mode"], self.config.verbose)
            stage_time_inc(self.config, start_time, 'validate')
        else:
            score = None

        if self.config.verbose:
            self.stages_time_print()

        return result, score

    def stages_time_print(self, sort_by_time=True):
        if 'stages_time' in self.config.data.keys():
            d = self.config['stages_time']
            print('\n', '-' * 3, 'Pipeline stages time, sec:', '-' * 3)
            l_just = max(len(x) for x in d.keys()) + 4
            items = sorted(d.items(), key=lambda kv: kv[1], reverse=True) if sort_by_time else d.items()
            for k, v in items:
                print(k.replace('\n', '_').ljust(l_just), '{:<10} {:.2f}'.format(' ', v))
            print('-' * 34, '\n')


    def pipeline_draw(self, file_name='AutoML_pipeline.gv', view=False):
        g = Digraph('G', filename=file_name)
        for i in self.config['graph']:
            g.edge(i[0], i[1])
    
        if view:
            g.view()
    
        return g

    @timeit
    def save(self):
        self.config.save()

    @timeit
    def load(self):
        self.config.load()
        self.config.verbose = self.verbose
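
stage_time_inc, called throughout Example #7, is assumed to accumulate the elapsed seconds since start_time under a named stage in config['stages_time']; a minimal sketch under that assumption:

import time

def stage_time_inc(config, start_time: float, stage: str):
    # hypothetical helper: add the time elapsed since start_time to this stage's running total
    elapsed = time.time() - start_time
    config['stages_time'][stage] = config['stages_time'].get(stage, 0.0) + elapsed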