Exemplo n.º 1
0
def main():
    """Command-line entry point for the EMBER training pipeline.

    Parses CLI arguments, vectorizes raw feature files in DATADIR when the
    cached .dat files are absent, and optionally creates metadata CSVs
    and/or trains a LightGBM model (with an optional parameter grid search).

    Exits via ``parser.error`` (SystemExit) when DATADIR is not a directory.
    Returns None.
    """
    prog = "train_ember"
    descr = "Train an ember model from a directory with raw feature files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("-v",
                        "--featureversion",
                        type=int,
                        default=2,
                        help="EMBER feature version")
    parser.add_argument("-m",
                        "--metadata",
                        action="store_true",
                        help="Create metadata CSVs")
    parser.add_argument("-t",
                        "--train",
                        action="store_true",
                        help="Train an EMBER model")
    parser.add_argument("datadir",
                        metavar="DATADIR",
                        type=str,
                        help="Directory with raw features")
    parser.add_argument("--optimize",
                        help="gridsearch to find best parameters",
                        action="store_true")
    args = parser.parse_args()

    # os.path.isdir() already returns False for nonexistent paths, so the
    # previous extra os.path.exists() check was redundant.
    if not os.path.isdir(args.datadir):
        parser.error("{} is not a directory with raw feature files".format(
            args.datadir))

    # Vectorization is expensive; skip it when both cached arrays exist.
    X_train_path = os.path.join(args.datadir, "X_train.dat")
    y_train_path = os.path.join(args.datadir, "y_train.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(args.datadir, args.featureversion)

    if args.metadata:
        ember.create_metadata(args.datadir)

    if args.train:
        # Default LightGBM hyperparameters; replaced entirely when
        # --optimize runs a grid search.
        params = {
            "boosting": "gbdt",
            "objective": "binary",
            "num_iterations": 1000,
            "learning_rate": 0.05,
            "num_leaves": 2048,
            "max_depth": 15,
            "min_data_in_leaf": 50,
            "feature_fraction": 0.5
        }
        if args.optimize:
            params = ember.optimize_model(args.datadir)
            print("Best parameters: ")
            print(json.dumps(params, indent=2))

        print("Training LightGBM model")
        lgbm_model = ember.train_model(args.datadir, params,
                                       args.featureversion)
        lgbm_model.save_model(os.path.join(args.datadir, "model.txt"))
Exemplo n.º 2
0

# In[3]:


X_train_path = os.path.join(data_dir, f"X_train_{feature_version}.dat")
y_train_path = os.path.join(data_dir, f"y_train_{feature_version}.dat")
# if they don't exist, compute them.
if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
    print("creating vectorized features ....")
    ember.create_vectorized_features(data_dir, feature_version=feature_version)

metadata_path = os.path.join(data_dir, f"metadata_{feature_version}.csv")
if not os.path.exists(metadata_path):
    print("creating metadata ....")
    _ = ember.create_metadata(data_dir, feature_version=feature_version)


# In[4]:


emberdf = ember.read_metadata(data_dir)
X_train, y_train, X_test, y_test = ember.read_vectorized_features(data_dir, feature_version=feature_version)

print("loading model ....")
f = open(os.path.join(data_dir,f"SGDR_model_{feature_version}.pkl"), "rb")
lgbm_model = pickle.load(f)
f.close()


# In[5]:
Exemplo n.º 3
0
import ember

# EMBER's converter works over the full dataset in one shot (~1.1M samples,
# roughly 30 minutes) and only needs to run once: the resulting arrays live
# in the data directory and are memory-mapped at training time.
ember_dir = "./data/ember/"
ember.create_vectorized_features(ember_dir)
ember.create_metadata(ember_dir)
Exemplo n.º 4
0
# -*- coding: utf-8 -*-
"""One-shot EMBER dataset preparation script.

Vectorizes the raw EMBER feature files and builds the metadata CSV.

Created on Thu Feb 28 12:42:09 2019

@author: Piyush
"""

import ember

dataset_path = 'D:/ml_projects/datasets/ember'
ember.create_vectorized_features(dataset_path)
ember.create_metadata(dataset_path)
Exemplo n.º 5
0
def create_vectorized_ember():
    """Vectorize the EMBER 2018 raw features and generate its metadata CSV."""
    dataset_dir = "./data/ember2018/"
    ember.create_vectorized_features(dataset_dir)
    ember.create_metadata(dataset_dir)