# Imports assumed by the functions below; the project-local helpers
# (create_tf_transform, apply_tf_transform, predict, prepare_test_data,
# build_roc, square_error, log_loss and the grnn_* predict/init functions)
# come from the recnn package itself and are not shown here.
import copy
import logging
import pickle
import sys

import numpy as np
import autograd as ag
from autograd.misc.optimizers import adam  # autograd.optimizers in older releases
import matplotlib.pyplot as plt
from ROOT import TFile
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state


def tester(pattern='R=1e-05_anti-kt'):
    """Fit the feature transform on the training set, apply it to the test
    set, and evaluate the regression model for the given file pattern."""
    X, y = np.load('data/npyfilesregression/Background_JEC_train_ID_preprocessed_{}.npy'.format(pattern))
    print('train data loaded')
    # Fit the transform on the training data only.
    tf = create_tf_transform(X)
    print('tf created')
    model = 'data/modelsregression/Model_{}.pickle'.format(pattern)
    X, y = np.load('data/npyfilesregression/Background_JEC_test_ID_preprocessed_{}.npy'.format(pattern))
    print('test data loaded')
    X = apply_tf_transform(X, tf)
    # NB: this call passes preloaded arrays, so it targets a regression
    # test(X, y, modelpath, regression) helper, not the six-argument
    # ROOT-writing test() defined below.
    test(X, y, model, regression=True)
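# The test() that tester() expects is not the ROOT-writing one defined next.
# A minimal sketch of what such a regression helper could look like; the name
# test_regression is hypothetical, and it assumes the same predict() and
# square_error() helpers used elsewhere in this file:
def test_regression(X, y, modelpath, regression=True):
    # Score the (already transformed) test sample with the pickled model.
    y_pred = predict(X, modelpath, grnn_predict_gated, regression=regression)
    print('mean squared error = %.4f' % square_error(y, y_pred).mean())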
def test(filepath, modelpath, trainfilepath, rootfilepath, branchname, isSignal):
    """Evaluate the model on `filepath` and append the predictions to a
    cloned ROOT tree in `rootfilepath` under the branch `branchname`."""
    # Fit the feature transform on the training set, then apply it to the test set.
    X, y = np.load(trainfilepath)
    tf = create_tf_transform(X)
    X, y = np.load(filepath)
    X_tf = apply_tf_transform(X, tf)
    y_pred = predict(X_tf, modelpath, grnn_predict_gated, regression=False)

    # Clone the test tree and attach the prediction and truth-label branches.
    testfile = TFile(rootfilepath, 'update')
    testtree = testfile.Get('testtree')
    finaltree = testtree.CloneTree(0)
    finaltree.SetName('finaltree')
    branchval = np.zeros(1)
    finaltree.Branch(branchname, branchval, branchname + '/D')
    branchval2 = np.zeros(1)
    branchval2[0] = 1. if isSignal else 0.
    finaltree.Branch('isSignal', branchval2, 'isSignal/D')

    # Fill one prediction per tree entry, assuming the order still matches.
    i = 0
    for event in testtree:
        # TODO: test here whether eta is still aligned between the
        # predictions and the tree entries.
        branchval[0] = y_pred[i]
        i += 1
        finaltree.Fill()
    finaltree.Write()
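# The TODO inside test() asks whether the predictions are still aligned with
# the tree entries. A minimal count-based sanity check (an eta-based check
# would additionally need the per-jet eta stored on both sides, which this
# sketch does not assume):
def check_alignment(testtree, y_pred):
    # One prediction per tree entry is the invariant test() relies on.
    n_entries = testtree.GetEntries()
    assert len(y_pred) == n_entries, \
        'got %d predictions for %d tree entries' % (len(y_pred), n_entries)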
def train(filename_train,
          filename_model,
          regression=False,
          simple=False,
          n_features=14,
          n_hidden=40,
          n_epochs=5,
          batch_size=64,
          step_size=0.0005,
          decay=0.9,
          random_state=42,
          verbose=False,
          statlimit=-1):
    # Initialization
    gated = not simple
    if verbose:
        logging.info("Calling with...")
        logging.info("\tfilename_train = %s" % filename_train)
        logging.info("\tfilename_model = %s" % filename_model)
        logging.info("\tgated = %s" % gated)
        logging.info("\tn_features = %d" % n_features)
        logging.info("\tn_hidden = %d" % n_hidden)
        logging.info("\tn_epochs = %d" % n_epochs)
        logging.info("\tbatch_size = %d" % batch_size)
        logging.info("\tstep_size = %f" % step_size)
        logging.info("\tdecay = %f" % decay)
        logging.info("\trandom_state = %d" % random_state)
    rng = check_random_state(random_state)

    # Make data
    if verbose:
        logging.info("Loading data...")
    if filename_train[-1] == "e":  # crude format check: ".pickle" ends in "e"
        fd = open(filename_train, "rb")
        X, y = pickle.load(fd)
        fd.close()
    else:
        X, y = np.load(filename_train)
    X = np.array(X).astype(dict)
    y = np.array(y).astype(float)

    # Shuffle and optionally truncate the sample.
    flush = np.random.permutation(len(X))
    X, y = X[flush][:statlimit], y[flush][:statlimit]

    ### delete single particles ###
    i = 0
    while i < len(X):
        if len(X[i]["content"]) == 1:
            X = np.delete(X, i)
            y = np.delete(y, i)
        else:
            i += 1

    if regression:
        # Baseline loss obtained by predicting the raw jet pt unchanged.
        zerovalue = square_error(y, [x["pt"] for x in X]).mean()
    X = list(X)

    if verbose:
        logging.info("\tfilename = %s" % filename_train)
        logging.info("\tX size = %d" % len(X))
        logging.info("\ty size = %d" % len(y))

    # Preprocessing
    if verbose:
        logging.info("Preprocessing...")
    tf = create_tf_transform(X)
    X = apply_tf_transform(X, tf)

    # Split into train+validation
    logging.info("Splitting into train and validation...")
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=rng)
    del X
    del y

    # Training
    if verbose:
        logging.info("Training...")
    if gated:
        predict = grnn_predict_gated
        init = grnn_init_gated
    else:
        predict = grnn_predict_simple
        init = grnn_init_simple

    trained_params = init(n_features, n_hidden, random_state=rng)
    n_batches = int(np.ceil(len(X_train) / batch_size))
    best_score = [np.inf]  # yuck, but works
    best_params = [trained_params]

    def loss(X, y, params):
        y_pred = predict(params, X, regression=regression)
        if regression:
            l = square_error(y, y_pred).mean()
        else:
            l = log_loss(y, y_pred).mean()
        return l

    def objective(params, iteration):
        # Draw a deterministic minibatch for this iteration.
        rng = check_random_state(iteration % n_batches)
        start = rng.randint(len(X_train) - batch_size)
        idx = slice(start, start + batch_size)
        return loss(X_train[idx], y_train[idx], params)

    def callback(params, iteration, gradient, regression=False):
        # Every 100 iterations: checkpoint the best parameters seen so far.
        if iteration % 100 == 0:
            the_loss = loss(X_valid, y_valid, params)
            if the_loss < best_score[0]:
                best_score[0] = the_loss
                best_params[0] = copy.deepcopy(params)
                fd = open(filename_model, "wb")
                pickle.dump(best_params[0], fd)
                fd.close()
            if verbose:
                if regression:
                    logging.info(
                        "%5d\t~loss(train) = %.4f\tloss(valid) = %.4f"
                        "\tbest_loss(valid) = %.4f" % (
                            iteration,
                            loss(X_train[:5000], y_train[:5000], params),
                            loss(X_valid, y_valid, params),
                            best_score[0]))
                else:
                    roc_auc = roc_auc_score(
                        y_valid, predict(params, X_valid, regression=regression))
                    logging.info(
                        "%5d\t~loss(train) = %.4f\tloss(valid) = %.4f"
                        "\troc_auc(valid) = %.4f\tbest_loss(valid) = %.4f" % (
                            iteration,
                            loss(X_train[:5000], y_train[:5000], params),
                            loss(X_valid, y_valid, params),
                            roc_auc,
                            best_score[0]))

    # One adam sweep over the batches per epoch, decaying the step size
    # between epochs.
    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)
        if regression:
            logging.info("zerovalue = %.4f" % zerovalue)
        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)
        step_size = step_size * decay
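# Example invocation of train(); the file locations mirror the regression
# files used by tester() above and are assumptions about the repository
# layout, not paths confirmed by the source:
#
#   train('data/npyfilesregression/Background_JEC_train_ID_preprocessed_R=1e-05_anti-kt.npy',
#         'data/modelsregression/Model_R=1e-05_anti-kt.pickle',
#         regression=True, verbose=True)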
sys.path.append("..")

# In[]:
basepath = '/data/conda/recnn/data'
name = "anti-kt"
trainfile = basepath + "/npyfiles/subjet_oriented_" + name + "_train.npy"
testfile = basepath + "/npyfiles/subjet_oriented_" + name + "_test.npy"
modelpath = basepath + "/models/subjet_oriented_" + name + "_model.pickle"

# In[]:
### Load training data ###
X, y = np.load(trainfile)
X = np.array(X).astype(dict)
y = np.array(y).astype(int)
### to rescale test data ###
tf = create_tf_transform(X, y)

### Load test data ###
X1, y1 = np.load(testfile)
X1 = np.array(X1).astype(dict)
y1 = np.array(y1).astype(int)
X1, y1 = prepare_test_data(tf, X1, y1)

# In[]:
### Build the ROC ###
r, f, t = build_roc(X1, y1, modelpath, func=grnn_predict_gated)
print(r)

# In[]:
plt.plot(f, t, label=name)
tpr, fpr = np.load('/data/conda/recnn/data/roccurves/standardID_ROC.npy')
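# A minimal sketch for finishing the comparison plot above. The axis
# conventions (f as false-positive rate on x, t as true-positive rate on y)
# and the (tpr, fpr) ordering inside standardID_ROC.npy are assumptions
# inferred from the variable names, not confirmed by the source:
plt.plot(fpr, tpr, label='standard ID')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc='lower right')
plt.show()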