import argparse
import json
import os

import ember


def main():
    prog = "train_ember"
    descr = "Train an ember model from a directory with raw feature files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("-v", "--featureversion", type=int, default=2, help="EMBER feature version")
    parser.add_argument("-m", "--metadata", action="store_true", help="Create metadata CSVs")
    parser.add_argument("-t", "--train", action="store_true", help="Train an EMBER model")
    parser.add_argument("datadir", metavar="DATADIR", type=str, help="Directory with raw features")
    parser.add_argument("--optimize", help="gridsearch to find best parameters", action="store_true")
    args = parser.parse_args()

    if not os.path.exists(args.datadir) or not os.path.isdir(args.datadir):
        parser.error("{} is not a directory with raw feature files".format(args.datadir))

    X_train_path = os.path.join(args.datadir, "X_train.dat")
    y_train_path = os.path.join(args.datadir, "y_train.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(args.datadir, args.featureversion)

    if args.metadata:
        ember.create_metadata(args.datadir)

    if args.train:
        params = {
            "boosting": "gbdt",
            "objective": "binary",
            "num_iterations": 1000,
            "learning_rate": 0.05,
            "num_leaves": 2048,
            "max_depth": 15,
            "min_data_in_leaf": 50,
            "feature_fraction": 0.5
        }
        if args.optimize:
            params = ember.optimize_model(args.datadir)
            print("Best parameters: ")
            print(json.dumps(params, indent=2))
        print("Training LightGBM model")
        lgbm_model = ember.train_model(args.datadir, params, args.featureversion)
        lgbm_model.save_model(os.path.join(args.datadir, "model.txt"))
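# A minimal entry-point sketch, assumed here rather than copied from the original
# script: the conventional __main__ guard so the file can be run directly, e.g.
#   python train_ember.py -v 2 -m -t /path/to/ember/
if __name__ == "__main__":
    main()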
# In[3]:

X_train_path = os.path.join(data_dir, f"X_train_{feature_version}.dat")
y_train_path = os.path.join(data_dir, f"y_train_{feature_version}.dat")

# if they don't exist, compute them.
if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
    print("creating vectorized features ....")
    ember.create_vectorized_features(data_dir, feature_version=feature_version)

metadata_path = os.path.join(data_dir, f"metadata_{feature_version}.csv")
if not os.path.exists(metadata_path):
    print("creating metadata ....")
    _ = ember.create_metadata(data_dir, feature_version=feature_version)


# In[4]:

emberdf = ember.read_metadata(data_dir)
X_train, y_train, X_test, y_test = ember.read_vectorized_features(data_dir, feature_version=feature_version)

print("loading model ....")
with open(os.path.join(data_dir, f"SGDR_model_{feature_version}.pkl"), "rb") as f:
    lgbm_model = pickle.load(f)


# In[5]:
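# A minimal follow-up sketch (not from the original notebook): score the loaded model
# on the labeled test split. It assumes the pickled object exposes a scikit-learn style
# predict() method and that samples labeled -1 (unlabeled) should be excluded.
from sklearn.metrics import roc_auc_score

labeled = y_test != -1
y_pred = lgbm_model.predict(X_test[labeled])
print("test ROC AUC:", roc_auc_score(y_test[labeled], y_pred))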
import ember

# ember hard codes the size of data.
# It's best to just let it convert all 1.1M points. This only takes about 30 minutes,
# and it only has to be done once.
# Data points are stored in the data directory and are mmapped into memory when training.
ember.create_vectorized_features("./data/ember/")
ember.create_metadata("./data/ember/")
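# Sketch of reading the vectorized features back after the one-time conversion above
# (the "./data/ember/" directory is carried over from the snippet it follows).
# read_vectorized_features returns memory-mapped arrays, so this does not pull the
# full dataset into RAM.
X_train, y_train, X_test, y_test = ember.read_vectorized_features("./data/ember/")
print(X_train.shape, X_test.shape)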
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 28 12:42:09 2019

@author: Piyush
"""

import ember

ember.create_vectorized_features('D:/ml_projects/datasets/ember')
ember.create_metadata('D:/ml_projects/datasets/ember')
import ember


def create_vectorized_ember():
    ember.create_vectorized_features("./data/ember2018/")
    ember.create_metadata("./data/ember2018/")
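# Hypothetical usage sketch, not from the original file: build the features once, then
# train a LightGBM model on them with ember.train_model, echoing the parameters used by
# the training script earlier in this section. The ember2018 directory and feature
# version 2 are assumptions carried over from the surrounding snippets.
if __name__ == "__main__":
    create_vectorized_ember()
    params = {"boosting": "gbdt", "objective": "binary", "learning_rate": 0.05}
    lgbm_model = ember.train_model("./data/ember2018/", params, feature_version=2)
    lgbm_model.save_model("./data/ember2018/model.txt")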