class AutoML:
    """Thin driver that wires CSV input to the project's train/predict helpers.

    All state shared between stages lives in ``self.config`` (a project
    ``Config`` object bound to ``model_dir``).
    """

    def __init__(self, model_dir: str):
        """Create ``model_dir`` if it does not exist and bind a ``Config`` to it."""
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        """Read ``train_csv``, preprocess it and fit on its ``target`` column.

        Args:
            train_csv: Path to the training CSV; must contain a ``target`` column.
            mode: Task mode, stored in the config under ``"mode"``.
        """
        self.config["task"] = "train"
        self.config["mode"] = mode

        # os.path.join instead of string concatenation so the separator is
        # handled by the standard library.
        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        # The return value is not used: the frame is presumably modified in place.
        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)
        train(X, y, self.config)

    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Predict ``test_csv`` chunk by chunk and write ``prediction_csv``.

        Args:
            test_csv: Path to the CSV to score; must contain a ``line_id`` column.
            prediction_csv: Output path for the ``line_id``/``prediction`` table.

        Returns:
            ``(result, score)`` where ``result`` is the written DataFrame and
            ``score`` is whatever ``validate`` returns when a companion
            target file exists, otherwise ``None``.
        """
        self.config["task"] = "predict"

        # A bare file name has an empty dirname; joining keeps the scratch
        # directory relative ("tmp") instead of collapsing to the root "/tmp".
        self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        line_ids = []
        predictions = []

        chunks = pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
            chunksize=self.config["nrows"],
        )
        for X in chunks:
            # Capture ids before preprocessing touches the chunk.
            line_ids.extend(X["line_id"])
            preprocess(X, self.config)
            predictions.extend(predict(X, self.config))

        result = pd.DataFrame({"line_id": line_ids, "prediction": predictions})
        result.to_csv(prediction_csv, index=False)

        # NOTE(review): str.replace rewrites every "test" substring in the
        # path, directory names included - confirm this matches the data layout.
        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
            logf('SCORE:', score)
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        """Persist the config via ``Config.save``."""
        self.config.save()

    @timeit
    def load(self):
        """Restore the config via ``Config.load``."""
        self.config.load()
class AutoML:
    """Thin driver that wires CSV input to the project's train/predict helpers.

    All state shared between stages lives in ``self.config`` (a project
    ``Config`` object bound to ``model_dir``).
    """

    # Number of rows preprocessed and scored per batch in ``predict``.
    PREDICT_CHUNK_ROWS = 100000

    def __init__(self, model_dir: str):
        """Create ``model_dir`` if it does not exist and bind a ``Config`` to it."""
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        """Read ``train_csv``, preprocess it and fit on its ``target`` column.

        Args:
            train_csv: Path to the training CSV; must contain a ``target`` column.
            mode: Task mode, stored in the config under ``"mode"``.
        """
        self.config["task"] = "train"
        self.config["mode"] = mode

        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        # The return value is not used: the frame is presumably modified in place.
        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)
        train(X, y, self.config)

    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Score ``test_csv`` in fixed-size row batches and write ``prediction_csv``.

        Args:
            test_csv: Path to the CSV to score; must contain a ``line_id`` column.
            prediction_csv: Output path for the ``line_id``/``prediction`` table.

        Returns:
            ``(result, score)`` where ``result`` is the written DataFrame sorted
            by ``line_id`` and ``score`` is whatever ``validate`` returns when a
            companion target file exists, otherwise ``None``.
        """
        self.config["task"] = "predict"

        # A bare file name has an empty dirname; joining keeps the scratch
        # directory relative ("tmp") instead of collapsing to the root "/tmp".
        self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(test_csv, self.config)
        line_ids = list(df["line_id"])
        predictions = []

        batch_rows = self.PREDICT_CHUNK_ROWS
        for start in range(0, len(df), batch_rows):
            # Work on a copy so preprocessing never writes into the full frame.
            X = df[start:start + batch_rows].copy()
            preprocess(X, self.config)
            predictions.extend(predict(X, self.config))

        result = pd.DataFrame({"line_id": line_ids, "prediction": predictions})
        result.sort_values("line_id", inplace=True)
        result.to_csv(prediction_csv, index=False)

        # NOTE(review): str.replace rewrites every "test" substring in the
        # path, directory names included - confirm this matches the data layout.
        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score

    def save(self):
        """Persist the config via ``Config.save``."""
        self.config.save()

    def load(self):
        """Restore the config via ``Config.load``."""
        self.config.load()
class AutoML:
    """Thin driver that wires CSV input to the project's train/predict helpers.

    All state shared between stages lives in ``self.config`` (a project
    ``Config`` object bound to ``model_dir``).
    """

    def __init__(self, model_dir: str):
        """Create ``model_dir`` if it does not exist and bind a ``Config`` to it."""
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        """Read ``train_csv``, preprocess it and fit on its ``target`` column.

        Args:
            train_csv: Path to the training CSV; must contain a ``target`` column.
            mode: ``"regression"`` selects the ``regression`` objective with
                ``rmse``; any other value selects ``binary`` with ``auc``.
        """
        is_regression = mode == "regression"

        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config["objective"] = "regression" if is_regression else "binary"
        self.config["metric"] = "rmse" if is_regression else "auc"

        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        df = preprocess(df, self.config)

        y = df["target"].copy()
        X = df.drop("target", axis=1).copy()

        # Drop the combined frame before training so only X and y stay alive.
        del df
        gc.collect()

        # Remember the feature order; predict() re-applies it.
        self.config["columns"] = list(X)
        train(X, y, self.config)

    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Score the whole of ``test_csv`` at once and write ``prediction_csv``.

        Args:
            test_csv: Path to the CSV to score; must contain a ``line_id`` column.
            prediction_csv: Output path for the ``line_id``/``prediction`` table.

        Returns:
            ``(result, score)`` where ``result`` is the written DataFrame and
            ``score`` is whatever ``validate`` returns when a companion
            target file exists, otherwise ``None``.
        """
        self.config["task"] = "predict"

        # A bare file name has an empty dirname; joining keeps the scratch
        # directory relative ("tmp") instead of collapsing to the root "/tmp".
        self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        self.config["prediction_csv"] = prediction_csv
        self.config["line_id"] = []
        self.config["start_time"] = time.time()

        X = pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
        )

        # Capture ids before preprocessing touches the frame.
        line_ids = X["line_id"].values
        self.config["line_id"] = line_ids

        X = preprocess(X, self.config)
        X = X[self.config["columns"]]  # same column order as in training

        result = pd.DataFrame({
            "line_id": line_ids,
            "prediction": predict(X, self.config),
        })
        result.to_csv(prediction_csv, index=False)

        # NOTE(review): str.replace rewrites every "test" substring in the
        # path, directory names included - confirm this matches the data layout.
        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"], self.config)
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        """Persist the config via ``Config.save``."""
        self.config.save()

    @timeit
    def load(self):
        """Restore the config via ``Config.load``."""
        self.config.load()
class AutoML:
    """Thin driver that wires CSV input to the project's train/predict helpers.

    This variant additionally loads a holiday calendar at training time and,
    when the config contains a ``holiday_detect`` entry, zeroes predictions
    that fall on holidays or weekends.
    """

    def __init__(self, model_dir: str):
        """Create ``model_dir`` if it does not exist and bind a ``Config`` to it."""
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        """Read ``train_csv``, preprocess it and fit on its ``target`` column.

        Args:
            train_csv: Path to the training CSV; must contain a ``target`` column.
            mode: Task mode, stored in the config under ``"mode"``.
        """
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config["model"] = {}
        self.config["ensemble"] = {"lgb": 1}

        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        # Load the holiday calendar; the path is relative to the working directory.
        path_holiday = './holiday.csv'
        holiday = pd.read_csv(
            path_holiday,
            encoding='utf-8',
            low_memory=False,
            dtype={'holiday': str},
        )['holiday'].values
        self.config['holiday'] = set(holiday)

        df = read_df(train_csv, self.config)
        print(df.shape)

        holiday_detect(df, self.config)
        # The return value is not used: the frame is presumably modified in place.
        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)
        train(X, y, self.config)

    @staticmethod
    def _is_day_off(dt, holidays) -> bool:
        """Return True when ``dt`` is a listed holiday, a Saturday or a Sunday."""
        date_str = str(dt).split(' ')[0].strip()
        return bool(date_str in holidays or dt.weekday() in (5, 6))

    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Predict ``test_csv`` chunk by chunk and write ``prediction_csv``.

        Args:
            test_csv: Path to the CSV to score; must contain a ``line_id`` column.
            prediction_csv: Output path for the ``line_id``/``prediction`` table.

        Returns:
            ``(result, score)`` where ``result`` is the written DataFrame and
            ``score`` is whatever ``validate`` returns when a companion
            target file exists, otherwise ``None``.
        """
        self.config["task"] = "predict"

        # A bare file name has an empty dirname; joining keeps the scratch
        # directory relative ("tmp") instead of collapsing to the root "/tmp".
        self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        # Checked once: the post-processing below needs the datetime column
        # to have been collected for every chunk.
        use_holiday = 'holiday_detect' in self.config
        dt_fea = self.config['holiday_detect'] if use_holiday else None

        result = {"line_id": [], "prediction": []}
        if use_holiday:
            result["datetime"] = []

        chunks = pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
            chunksize=self.config["nrows"],
        )
        for X in chunks:
            # Capture ids (and dates) before preprocessing touches the chunk.
            result["line_id"] += list(X["line_id"])
            if use_holiday:
                result["datetime"] += list(X[dt_fea])
            preprocess(X, self.config)
            result["prediction"] += list(predict(X, self.config))

        result = pd.DataFrame(result)

        # Post-process: force predictions on holidays and weekends to zero.
        if use_holiday:
            holidays = self.config['holiday']
            # One mask assignment instead of a per-row iterrows()/.loc write.
            day_off = np.array(
                [self._is_day_off(dt, holidays) for dt in result["datetime"]],
                dtype=bool,
            )
            result.loc[day_off, 'prediction'] = 0
            result.drop(["datetime"], axis=1, inplace=True)

        result.to_csv(prediction_csv, index=False)

        # NOTE(review): str.replace rewrites every "test" substring in the
        # path, directory names included - confirm this matches the data layout.
        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        """Persist the config via ``Config.save``."""
        self.config.save()

    @timeit
    def load(self):
        """Restore the config via ``Config.load``."""
        self.config.load()
class AutoML:
    """Driver that runs the project's ``pipeline`` for training and prediction.

    Configuration and per-stage timings live in ``self.config`` (a project
    ``Config`` object built from ``model_dir`` and ``params``).
    """

    def __init__(self, model_dir: str, params: dict, verbose: int=0):
        """Build the config and fill in defaults for missing parameters.

        Args:
            model_dir: Directory handed to ``Config``.
            params: User parameters handed to ``Config``.
            verbose: Verbosity level; also re-applied to the config after ``load``.
        """
        self.config = Config(model_dir, params, verbose=verbose)
        self.verbose = verbose

        # Only fill in what the caller did not provide.
        cfg_params = self.config['params']
        memory = cfg_params.setdefault('memory', {})
        memory.setdefault('max_size_mb', 2)
        memory.setdefault('max_size_train_samples', 10000)
        cfg_params.setdefault('field_target_name', 'target')

    @timeit
    def train(self, train_csv: str, mode: str):
        """Read ``train_csv`` and run the pipeline on it in train mode.

        Args:
            train_csv: Path to the training CSV.
            mode: Task mode, stored in the config under ``"mode"``.
        """
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config['stages_time'] = {}

        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        start_time = time.time()
        df = read_df(train_csv, self.config)
        stage_time_inc(self.config, start_time, 'train read_df')

        pipeline(df, self.config)

        if self.config.verbose:
            self.stages_time_print()

    @timeit
    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Run the pipeline over ``test_csv`` chunk by chunk and write ``prediction_csv``.

        Args:
            test_csv: Path to the CSV to score; must contain a ``line_id`` column.
            prediction_csv: Output path for the ``line_id``/``prediction`` table.

        Returns:
            ``(result, score)`` where ``result`` is the written DataFrame and
            ``score`` is whatever ``validate`` returns when a companion
            target file exists, otherwise ``None``.
        """
        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
        self.config['stages_time'] = {}
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        line_ids = []
        predictions = []

        start_time = time.time()
        chunks = pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
            chunksize=self.config["nrows"],
        )
        for X in chunks:
            stage_time_inc(self.config, start_time, 'test pd.read_csv')
            line_ids.extend(X["line_id"])
            pipeline(X, self.config)
            # The first element of the last graph entry names the column the
            # predictions are read from.
            predictions.extend(X[self.config['graph'][-1][0]])
            # Restart the clock so the next chunk's read time does not include
            # this chunk's pipeline time.
            # NOTE(review): presumes stage_time_inc records the time elapsed
            # since start_time - confirm.
            start_time = time.time()

        start_time = time.time()
        result = pd.DataFrame({"line_id": line_ids, "prediction": predictions})
        result.to_csv(prediction_csv, index=False)
        stage_time_inc(self.config, start_time, 'result.to_csv')

        # NOTE(review): str.replace rewrites every "test" substring in the
        # path, directory names included - confirm this matches the data layout.
        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            start_time = time.time()
            score = validate(result, target_csv, self.config["mode"], self.config.verbose)
            stage_time_inc(self.config, start_time, 'validate')
        else:
            score = None

        if self.config.verbose:
            self.stages_time_print()

        return result, score

    def stages_time_print(self, sort_by_time=True):
        """Print the recorded stage timings as an aligned table.

        Does nothing when the config holds no ``stages_time`` entry.

        Args:
            sort_by_time: When true, list the slowest stages first; otherwise
                keep the order in which stages were recorded.
        """
        if 'stages_time' not in self.config.data.keys():
            return

        d = self.config['stages_time']
        print('\n','-'*3, 'Pipeline stages time, sec:','-'*3)

        # default=0 keeps an empty timing table from raising ValueError.
        l_just = max((len(x) for x in d), default=0) + 4

        if sort_by_time:
            rows = sorted(d.items(), key=lambda kv: kv[1], reverse=True)
        else:
            rows = d.items()

        for k, v in rows:
            # Stage names may contain newlines; flatten them to keep one row each.
            print(k.replace('\n', '_').ljust(l_just), '{:<10} {:.2f}'.format(' ', v))
        print('-'*34, '\n')

    def pipeline_draw(self, file_name='AutoML_pipeline.gv', view=False):
        """Build a ``Digraph`` from the config's ``graph`` entries.

        Args:
            file_name: File name passed to ``Digraph``.
            view: When true, call ``view()`` on the graph before returning it.

        Returns:
            The constructed ``Digraph``.
        """
        g = Digraph('G', filename=file_name)
        for i in self.config['graph']:
            g.edge(i[0], i[1])

        if view:
            g.view()

        return g

    @timeit
    def save(self):
        """Persist the config via ``Config.save``."""
        self.config.save()

    @timeit
    def load(self):
        """Restore the config via ``Config.load`` and re-apply this instance's verbosity."""
        self.config.load()
        self.config.verbose = self.verbose