def main():
    """Count records in features.jsonl, vectorize them, and train a model.

    Command line:
        -d/--datadir  directory containing features.jsonl
        -o/--output   directory to write model.txt into (created if missing)
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--datadir", help="Features Directory", type=str)
    parser.add_argument("-o", "--output", help="output Directory", type=str)
    args = parser.parse_args()
    if not os.path.exists(args.datadir):
        parser.error("{} is not a directory".format(args.datadir))
    if not os.path.exists(args.output):
        # makedirs(exist_ok=True) also creates missing parent directories and
        # tolerates the directory appearing between the check and the call
        # (os.mkdir would raise in both cases).
        os.makedirs(args.output, exist_ok=True)
    # Get total lines from feature.jsonl — the row count is what the ember
    # helpers below take as their second argument.
    with jsonlines.open(os.path.join(args.datadir, 'features.jsonl')) as reader:
        rows = sum(1 for _ in reader.iter(type=dict, skip_invalid=True))
    # NOTE(review): `clear` is defined elsewhere in this file — presumably it
    # removes stale vectorized .dat files; confirm before relying on it.
    clear(args.datadir)
    ember.create_vectorized_features(args.datadir, rows)
    # Train and save model
    print("Training LightGBM model")
    lgbm_model = ember.train_model(args.datadir, rows)
    lgbm_model.save_model(os.path.join(args.output, "model.txt"))
def main():
    """Entry point for the full-featured ember training CLI.

    -v picks the feature version, -m writes metadata CSVs, -t runs training,
    and --optimize grid-searches LightGBM parameters before training.
    """
    prog = "train_ember"
    descr = "Train an ember model from a directory with raw feature files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("-v", "--featureversion", type=int, default=2, help="EMBER feature version")
    parser.add_argument("-m", "--metadata", action="store_true", help="Create metadata CSVs")
    parser.add_argument("-t", "--train", action="store_true", help="Train an EMBER model")
    parser.add_argument("datadir", metavar="DATADIR", type=str, help="Directory with raw features")
    parser.add_argument("--optimize", help="gridsearch to find best parameters", action="store_true")
    args = parser.parse_args()

    data_dir = args.datadir
    if not os.path.exists(data_dir) or not os.path.isdir(data_dir):
        parser.error("{} is not a directory with raw feature files".format(data_dir))

    # Vectorize only when either training matrix is missing on disk.
    have_vectors = all(
        os.path.exists(os.path.join(data_dir, fname))
        for fname in ("X_train.dat", "y_train.dat"))
    if not have_vectors:
        print("Creating vectorized features")
        ember.create_vectorized_features(data_dir, args.featureversion)

    if args.metadata:
        ember.create_metadata(data_dir)

    if not args.train:
        return

    # Baseline LightGBM hyperparameters, replaced wholesale by the grid
    # search when --optimize is given.
    params = {
        "boosting": "gbdt",
        "objective": "binary",
        "num_iterations": 1000,
        "learning_rate": 0.05,
        "num_leaves": 2048,
        "max_depth": 15,
        "min_data_in_leaf": 50,
        "feature_fraction": 0.5
    }
    if args.optimize:
        params = ember.optimize_model(data_dir)
        print("Best parameters: ")
        print(json.dumps(params, indent=2))

    print("Training LightGBM model")
    lgbm_model = ember.train_model(data_dir, params, args.featureversion)
    lgbm_model.save_model(os.path.join(data_dir, "model.txt"))
def main():
    """Vectorize (if needed) and train on a hard-coded dataset directory,
    logging a timestamp at each stage."""
    datadir = '/home/mira/research/dataset/ember.2'
    if not os.path.exists(datadir) or not os.path.isdir(datadir):
        # Fixed: the original printed the error but fell through and kept
        # executing with the bad path; bail out instead.
        print("not a path")
        return
    X_train_path = os.path.join(datadir, "X_train.dat")
    y_train_path = os.path.join(datadir, "y_train.dat")
    # Recompute the vectorized features only when either matrix is missing.
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("[{}] Creating vectorized features".format(
            datetime.datetime.now()))
        ember.create_vectorized_features(datadir)
    print("[{}] Training LightGBM model".format(datetime.datetime.now()))
    lgbm_model = ember.train_model(datadir)
    lgbm_model.save_model(os.path.join(datadir, "model.txt"))
    print("[{}] Done".format(datetime.datetime.now()))
def train_multiple(data_dir, n_models=10):
    """Train a bunch of models to explore how different they are.

    Args:
        data_dir: directory holding the vectorized EMBER features.
        n_models: how many models to train (default 10, matching the
            original hard-coded count).

    Each model is saved as ember_model_2018_random{i}.txt in data_dir.
    """
    # LightGBM hyperparameters shared by every run; run-to-run variation
    # comes only from training randomness.
    params = {
        "boosting": "gbdt",
        "objective": "binary",
        "num_iterations": 1000,
        "learning_rate": 0.05,
        "num_leaves": 2048,
        "feature_fraction": 0.5,
        "bagging_fraction": 1.0,
        "max_depth": 15,
        "min_data_in_leaf": 50
    }
    for i in range(n_models):
        # 2 = EMBER feature version
        lgbm_model = ember.train_model(data_dir, params, 2)
        lgbm_model.save_model(
            os.path.join(data_dir, f"ember_model_2018_random{i}.txt"))
def main():
    """Minimal training driver: vectorize the raw features when the .dat
    matrices are absent, then train a LightGBM model and save it as
    model.txt inside the data directory."""
    parser = argparse.ArgumentParser(
        prog="train_ember",
        description="Train an ember model from a directory with raw feature files")
    parser.add_argument("datadir", metavar="DATADIR", type=str, help="Directory with raw features")
    args = parser.parse_args()

    directory = args.datadir
    if not (os.path.exists(directory) and os.path.isdir(directory)):
        parser.error("{} is not a directory with raw feature files".format(directory))

    have_vectors = all(
        os.path.exists(os.path.join(directory, name))
        for name in ("X_train.dat", "y_train.dat"))
    if not have_vectors:
        print("Creating vectorized features")
        ember.create_vectorized_features(directory)

    print("Training LightGBM model")
    model = ember.train_model(directory)
    model.save_model(os.path.join(directory, "model.txt"))
def trainModel(self, vectorizedDataDir):
    """Fit a LightGBM model on the vectorized features found in
    vectorizedDataDir, cache it on the instance as lgbm_model, and
    return it."""
    model = ember.train_model(vectorizedDataDir)
    self.lgbm_model = model
    return model
# NOTE(review): this chunk begins mid-statement — the opening of the fit()
# call (the model object and the training-input arguments) lives in an
# earlier notebook cell not visible here.
          y_train[train_rows],
          epochs=3,
          verbose=2,
          validation_data=(x_test, y_test))

# In[ ]:

# One-hot encode the test labels; only the resulting shape is printed here.
y_binary = to_categorical(y_test)
print(y_binary.shape)

# In[ ]:

# EMBER model
# LightGBM hyperparameters — presumably the published EMBER baseline
# settings; TODO confirm against the ember reference training script.
params = {
    "boosting": "gbdt",
    "objective": "binary",
    "num_iterations": 1000,
    "learning_rate": 0.05,
    "num_leaves": 2048,
    "max_depth": 15,
    "min_data_in_leaf": 50,
    "feature_fraction": 0.5
}
print("training lightGBM model")
# Trailing 2 is the EMBER feature version.
lgbm_model = ember.train_model(datadir, params, 2)
lgbm_model.save_model(os.path.join(datadir, "model.txt"))

# In[ ]:
def main():
    """Vectorize EMBER features for a chosen feature version, train a
    LightGBM model, save it, and plot the top-10 feature importances.

    Command line:
        --modelname          label for the model (currently unused below)
        -v/--featureversion  EMBER feature version (default 2)
        DATADIR              directory containing the raw feature files
        --optimize           grid search for the best LightGBM parameters
    """
    prog = "train_ember"
    descr = "Train an ember model from a directory with raw feature files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("--modelname", type=str, default="SGD", help="Model name")
    parser.add_argument("-v", "--featureversion", type=int, default=2, help="EMBER feature version")
    parser.add_argument("datadir", metavar="DATADIR", type=str, help="Directory with raw features")
    parser.add_argument("--optimize", help="gridsearch to find best parameters", action="store_true")
    args = parser.parse_args()

    if not os.path.exists(args.datadir) or not os.path.isdir(args.datadir):
        parser.error("{} is not a directory with raw feature files".format(
            args.datadir))

    # The vectorized matrices are versioned on disk; if they don't exist,
    # compute them for the requested feature version.
    X_train_path = os.path.join(args.datadir, f"X_train_{args.featureversion}.dat")
    y_train_path = os.path.join(args.datadir, f"y_train_{args.featureversion}.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(args.datadir, args.featureversion)

    params = {
        "boosting": "gbdt",
        # NOTE(review): "regression" objective next to "Training Classifier
        # model" below — the sibling training scripts use "binary"; confirm
        # this is intentional before reuse.
        "objective": "regression",
        "num_iterations": 1000,
        "learning_rate": 0.05,
        "num_leaves": 2048,
        "max_depth": 15,
        "min_data_in_leaf": 50,
        "feature_fraction": 0.5,
        "num_threads": 2,
    }
    if args.optimize:
        params = ember.optimize_model(args.datadir)
        print("Best parameters: ")
        print(json.dumps(params, indent=2))

    print("Training Classifier model")
    lgbm_model = ember.train_model(args.datadir, params, args.featureversion)

    model_path = os.path.join(args.datadir, f"model_{args.featureversion}.txt")
    # Fixed: the old message claimed "model.txt" while the file actually
    # saved is versioned (model_{featureversion}.txt).
    print(f"file dumped into {model_path}")
    lgbm_model.save_model(model_path)

    print('Plotting feature importances...')
    lgb.plot_importance(lgbm_model, max_num_features=10)
    plt.savefig(f'lgbm_importances-0{args.featureversion}.png')
    # Open the plot in the default image viewer (Linux desktop only).
    os.system(f"xdg-open lgbm_importances-0{args.featureversion}.png")