import keras
from keras.models import load_model
# NBINS and global_loss_list are assumed to be defined elsewhere in this module.


def loadModel(inputDir, trainData, model, LoadModel, sampleDatasets=None, removedVars=None):
    inputModel = '%s/KERAS_check_best_model.h5' % inputDir
    # inputModel = '%s/KERAS_model.h5' % inputDir
    inputWeights = '%s/KERAS_check_best_model_weights.h5' % inputDir

    from DeepJetCore.DataCollection import DataCollection
    traind = DataCollection()
    traind.readFromFile(trainData)
    traind.dataclass.regressiontargetclasses = range(0, NBINS)
    print(traind.getNRegressionTargets())

    if LoadModel:
        # load the full serialized model, including any custom loss functions
        evalModel = load_model(inputModel, custom_objects=global_loss_list)
        shapes = traind.getInputShapes()
    else:
        # rebuild the architecture from the builder and load only the weights
        shapes = traind.getInputShapes()
        train_inputs = [keras.layers.Input(shape=s) for s in shapes]
        evalModel = model(train_inputs,
                          traind.getNClassificationTargets(),
                          traind.getNRegressionTargets(),
                          sampleDatasets, removedVars)
        evalModel.load_weights(inputWeights)
    return evalModel
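# A minimal usage sketch for loadModel, relying on the keras import above.
# The layer choices and every name below are illustrative assumptions, not
# code from this repository; a real builder would match the training model.
def exampleModel(inputs, nclasses, nregressions, datasets=None, removedVars=None):
    # flatten each input collection and merge them before a softmax head
    flat = [keras.layers.Flatten()(x) for x in inputs]
    merged = keras.layers.concatenate(flat) if len(flat) > 1 else flat[0]
    outputs = keras.layers.Dense(nclasses, activation='softmax')(merged)
    return keras.models.Model(inputs=inputs, outputs=outputs)

# Rebuild the architecture and load the best-checkpoint weights
# (paths are placeholders):
# evalModel = loadModel('train_out', 'train_out/dataCollection.dc',
#                       exampleModel, LoadModel=False)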
import h5py
import numpy as np
import pandas as pd
from DeepJetCore.DataCollection import DataCollection


def dcToDf(dc_file, df_out):
    """Flatten a DataCollection into a pandas DataFrame and optionally pickle it."""
    dc = DataCollection()
    dc.readFromFile(dc_file)
    NENT = 1  # set >1 to skip events
    filelist = []
    i = 0
    storeInputs = True
    count = 0
    feature_names = dc.dataclass.branches[1]
    spectator_names = dc.dataclass.branches[0]
    labels_names = ['truth' + l for l in dc.getUsedTruth()]
    for s in dc.samples:
        if count > 1000000:
            break
        spath = dc.getSamplePath(s)
        filelist.append(spath)
        h5File = h5py.File(spath, 'r')
        features_val_i = [
            h5File['x%i' % j][()]
            for j in range(0, h5File['x_listlength'][()][0])
        ]
        features_val_i = features_val_i[0][::NENT, 0, :]
        weights_val_i = h5File['w0'][()]
        labels_val_i = h5File['y0'][()][::NENT, :]
        spectators_val_i = h5File['z0'][()][::NENT, 0, :]
        if storeInputs:
            raw_features_val_i = h5File['z1'][()][::NENT, 0, :]
        if i == 0:
            weights_val = weights_val_i
            labels_val = labels_val_i
            spectators_val = spectators_val_i
            features_val = features_val_i
            if storeInputs:
                raw_features_val = raw_features_val_i
        else:
            weights_val = np.concatenate((weights_val, weights_val_i))
            labels_val = np.concatenate((labels_val, labels_val_i))
            features_val = np.concatenate((features_val, features_val_i))
            spectators_val = np.concatenate((spectators_val, spectators_val_i))
            if storeInputs:
                raw_features_val = np.concatenate(
                    (raw_features_val, raw_features_val_i))
        i += 1
        count += labels_val_i.shape[0]  # count only this sample's events

    entries = np.hstack((raw_features_val, spectators_val, labels_val,
                         weights_val.reshape((len(weights_val), 1))))
    df = pd.DataFrame(entries, columns=feature_names + spectator_names +
                      labels_names + ['weight'])
    if df_out is not None:
        df.to_pickle(df_out)
        print("Saved df to", df_out)
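# Hedged command-line wrapper for dcToDf; the argument handling here is an
# illustrative assumption, not part of the original script.
if __name__ == '__main__':
    import sys
    # usage: python thisScript.py <dataCollection.dc> <output.pkl>
    dcToDf(sys.argv[1], sys.argv[2])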
import os
import pickle
from DeepJetCore.DataCollection import DataCollection


class TrainingInfo:
    def __init__(self, directory):
        filename = os.path.join(directory, 'dataCollection.dc')
        file_ = open(filename, 'rb')
        self.samples = pickle.load(file_)
        sampleentries = pickle.load(file_)
        originRoots = pickle.load(file_)
        nsamples = pickle.load(file_)
        useweights = pickle.load(file_)
        batchsize = pickle.load(file_)
        dataclass = pickle.load(file_)
        weighter = pickle.load(file_)
        self._means = pickle.load(file_)
        file_.close()

        # Means dictionary: branch name -> (mean, variance)
        self.means = {name: (self._means[0][i], self._means[1][i])
                      for i, name in enumerate(self._means.dtype.names)}

        # DeepJetCore DataCollection
        self.dataCollection = DataCollection()
        self.dataCollection.readFromFile(filename)

        # Read the first sample to get the branch structure
        fullpath = self.dataCollection.getSamplePath(self.samples[0])
        self.dataCollection.dataclass.readIn(fullpath)
        self.branches = self.dataCollection.dataclass.branches

        print("Branches:")
        for i in range(len(self.branches)):
            print("Collection", i)
            for i_b, b in enumerate(self.branches[i]):
                print("  branch %2i/%2i %40s mean %8.5f var %8.5f" % (
                    i, i_b, b, self.means[b][0], self.means[b][1]))
            print()

    def dump(self, filename):
        pickle.dump([self.branches, self.means], open(filename, 'wb'))
        print("Written", filename)
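# Minimal usage sketch; 'train_out' is a placeholder for a training directory
# that contains a dataCollection.dc file, and the dump target is hypothetical.
if __name__ == '__main__':
    info = TrainingInfo('train_out')
    info.dump('training_info.pkl')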
def loadModel(inputDir, trainData, model, LoadModel, sampleDatasets=None, removedVars=None, adv=False):
    inputModel = '%s/KERAS_check_best_model.h5' % inputDir

    from DeepJetCore.DataCollection import DataCollection
    traind = DataCollection()
    traind.readFromFile(trainData)
    traind.dataclass.regressiontargetclasses = range(0, NBINS)
    print(traind.getNRegressionTargets())

    if LoadModel:
        evalModel = load_model(inputModel, custom_objects=global_loss_list)
        shapes = traind.getInputShapes()
    else:
        shapes = traind.getInputShapes()
        train_inputs = [keras.layers.Input(shape=s) for s in shapes]
        modelargs = {}
        if adv:
            # extra builder arguments for the adversarial variant
            modelargs.update({
                'nRegTargets': NBINS,
                'discTrainable': True,
                'advTrainable': True
            })
        evalModel = model(train_inputs,
                          traind.getNClassificationTargets(),
                          traind.getNRegressionTargets(),
                          sampleDatasets, removedVars, **modelargs)
        # Keras can read weights directly from a full-model HDF5 checkpoint
        evalModel.load_weights(inputModel)
    return evalModel
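# Hedged sketch of a builder compatible with the adversarial call path above:
# it must accept nRegTargets, discTrainable and advTrainable as keyword
# arguments. Every name and layer choice here is an illustrative assumption.
def exampleAdvModel(inputs, nclasses, nregressions, datasets=None,
                    removedVars=None, nRegTargets=None,
                    discTrainable=True, advTrainable=True):
    flat = [keras.layers.Flatten()(x) for x in inputs]
    merged = keras.layers.concatenate(flat) if len(flat) > 1 else flat[0]
    # mass-bin (adversary) outputs first, then the class outputs, matching the
    # column layout assumed by evaluate() with adv=True below
    massbins = keras.layers.Dense(nRegTargets, activation='softmax',
                                  trainable=advTrainable)(merged)
    classes = keras.layers.Dense(nclasses, activation='softmax',
                                 trainable=discTrainable)(merged)
    outputs = keras.layers.concatenate([massbins, classes])
    return keras.models.Model(inputs=inputs, outputs=outputs)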
# encoding: utf-8

from argparse import ArgumentParser
from DeepJetCore.DataCollection import DataCollection

parser = ArgumentParser(
    'convert a data collection to a single set of numpy arrays. '
    'Warning: this can produce a large output')
parser.add_argument('inputDataCollection')
parser.add_argument('outputFilePrefix')
args = parser.parse_args()

print('reading data collection')
dc = DataCollection()
dc.readFromFile(args.inputDataCollection)

print('producing feature array')
feat = dc.getAllFeatures()
print('producing truth array')
truth = dc.getAllLabels()
print('producing weight array')
weight = dc.getAllWeights()
print('producing means and norms array')
means = dc.means

from numpy import save
# The original script ends here; the per-collection naming scheme below is an
# assumption (the getAll* accessors return one array per input collection).
for j, a in enumerate(feat):
    save('%s_features_%i.npy' % (args.outputFilePrefix, j), a)
for j, a in enumerate(truth):
    save('%s_truth_%i.npy' % (args.outputFilePrefix, j), a)
for j, a in enumerate(weight):
    save('%s_weights_%i.npy' % (args.outputFilePrefix, j), a)
save(args.outputFilePrefix + '_means.npy', means)
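# Reading the arrays back, assuming the naming scheme sketched above:
import numpy as np
feat0 = np.load(args.outputFilePrefix + '_features_0.npy')
print(feat0.shape)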
LoadModel = False
removedVars = None
forceNClasses = False
signals = [1]
sigNames = ['Hbb']
backgrounds = [0]
backNames = ['QCD']
NClasses = len(signals) + len(backgrounds)

evalModel = loadModel(trainDir, inputTrainDataCollection, trainingModel,
                      LoadModel, forceNClasses, NClasses, inputDataset,
                      removedVars)
evalDir = opts.o

from DeepJetCore.DataCollection import DataCollection
testd = DataCollection()
testd.readFromFile(inputTestDataCollection)

if os.path.isdir(evalDir):
    raise Exception('output directory %s must not exist yet' % evalDir)
else:
    os.mkdir(evalDir)

df, features_val = makePlots(testd, evalModel, evalDir)
makeLossPlot(trainDir, evalDir)
#df = evaluate(testd, inputTrainDataCollection, evalModel, evalDir)
#make_plots(evalDir, savedir='Plots')
from DeepJetCore.DataCollection import DataCollection
from pprint import pprint

dc = DataCollection()
dc.readFromFile('dc/dataCollection.dc')
#dc.readFromFile('/storage/9/dseith/DeepJet/deepCSV/results/../../Ntuples/Thu_135917_batch/dataCollections/deepCSV/train/dataCollection.dc')
#dc.readFromFile('/storage/9/dseith/DeepJet/deepCSV/results/../../Ntuples/Thu_135917_batch/dataCollections/deepFlavour_FT_reg/train/dataCollection.dc')

#pprint(dc.means[0])
#print('-' * 100)
#pprint(dc.means[1])
#print('-' * 100)
#pprint(dc.means.dtype.names)
#pprint(dc.means[0][0].dtype)
#pprint(dc.useweights)
#pprint(dc.weighter)
#pprint(dc.samples)
#pprint(dc.sampleentries)
#pprint(dc.originRoots)
#pprint(dc.nsamples)
##pprint(dc.__batchsize)
pprint(dc.dataclass)
#pprint(dc.means)

# per-track CSV variables, stored six times (once per track) per jet
six_times = [
    'TagVarCSVTrk_trackJetDistVal',
    'TagVarCSVTrk_trackPtRel',
    'TagVarCSVTrk_trackDeltaR',
    'TagVarCSVTrk_trackPtRatio',
    'TagVarCSVTrk_trackSip3dSig',
    'TagVarCSVTrk_trackSip2dSig',
    'TagVarCSVTrk_trackDecayLenVal',
]
import os
from keras.models import load_model

# the guard condition is reconstructed from the error message
if os.path.isdir(args.outputDir):
    raise Exception('output directory must not exist yet')

# global_loss_list, global_layers_list and testDescriptor are imported
# elsewhere in this script
custom_objs = {}
custom_objs.update(global_loss_list)
custom_objs.update(global_layers_list)
model = load_model(args.inputModel, custom_objects=custom_objs)

td = testDescriptor()
if args.use:
    td.use_only = [int(i) for i in args.use.split(',')]

from DeepJetCore.DataCollection import DataCollection
testd = DataCollection()
testd.readFromFile(args.inputDataCollection)

os.mkdir(args.outputDir)

td.makePrediction(
    model, testd, args.outputDir,
    store_labels=args.labels,
    monkey_class=args.monkey_class)

td.writeToTextFile(args.outputDir + '/tree_association.txt')

# make the file reading entirely C++
# then it can be used for other studies
import h5py
import numpy as np
import pandas as pd


def evaluate(testd, trainData, model, outputDir, storeInputs=False, adv=False):
    NENT = 1  # set >1 to skip events

    filelist = []
    i = 0
    for s in testd.samples:
        spath = testd.getSamplePath(s)
        filelist.append(spath)
        h5File = h5py.File(spath, 'r')
        features_val = [
            h5File['x%i' % j][()]
            for j in range(0, h5File['x_listlength'][()][0])
        ]
        predict_test_i = model.predict(features_val)
        labels_val_i = h5File['y0'][()][::NENT, :]
        spectators_val_i = h5File['z0'][()][::NENT, 0, :]
        if storeInputs:
            raw_features_val_i = h5File['z1'][()][::NENT, 0, :]
        if i == 0:
            predict_test = predict_test_i
            labels_val = labels_val_i
            spectators_val = spectators_val_i
            if storeInputs:
                raw_features_val = raw_features_val_i
        else:
            predict_test = np.concatenate((predict_test, predict_test_i))
            labels_val = np.concatenate((labels_val, labels_val_i))
            spectators_val = np.concatenate((spectators_val, spectators_val_i))
            if storeInputs:
                raw_features_val = np.concatenate(
                    (raw_features_val, raw_features_val_i))
        i += 1
    # alternatively, testd.getAllFeatures()/getAllLabels()/getAllSpectators()
    # load everything at once

    # Branch names
    print(testd.dataclass.branches)
    feature_names = testd.dataclass.branches[1]
    spectator_names = testd.dataclass.branches[0]

    # Truth names come from the training collection
    from DeepJetCore.DataCollection import DataCollection
    traind = DataCollection()
    traind.readFromFile(trainData)
    truthnames = traind.getUsedTruth()

    # Store features
    print("Columns", spectator_names)
    df = pd.DataFrame(spectators_val, columns=spectator_names)
    if storeInputs:
        for i, tname in enumerate(feature_names):
            df[tname] = raw_features_val[:, i]

    # Add predictions
    print(truthnames)
    print(predict_test.shape)
    for i, tname in enumerate(truthnames):
        df['truth' + tname] = labels_val[:, i]
        if adv:
            # adversarial models put the NBINS mass-bin outputs first
            df['predict' + tname] = predict_test[:, NBINS + i]
            for j in range(NBINS):
                df['predict_massbin%i' % j] = predict_test[:, j + i]
        else:
            df['predict' + tname] = predict_test[:, i]

    print("Testing prediction:")
    print("Total: ", len(predict_test[:, 0]))
    for lab in truthnames:
        print(lab, ":", sum(df['truth' + lab].values))

    df.to_pickle(outputDir + '/output.pkl')  # save the dataframe
    print("Finished storing dataframe")
    return df
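# Hedged usage sketch for evaluate; the paths and the 'Hbb' truth label are
# placeholders, and evalModel is assumed to come from loadModel above.
# testd = DataCollection()
# testd.readFromFile('test_out/dataCollection.dc')
# df = evaluate(testd, 'train_out/dataCollection.dc', evalModel,
#               'eval_out', storeInputs=True)
# print(df[['truthHbb', 'predictHbb']].head())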
import h5py
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # headless plotting
import matplotlib.pyplot as plt


def evaluate(testd, trainData, model, outputDir):
    NENT = 1  # set >1 to skip events

    filelist = []
    i = 0
    for s in testd.samples:
        spath = testd.getSamplePath(s)
        filelist.append(spath)
        h5File = h5py.File(spath, 'r')
        features_val = [
            h5File['x%i' % j][()]
            for j in range(0, h5File['x_listlength'][()][0])
        ]
        predict_test_i = model.predict(features_val)
        if i == 0:
            predict_test = predict_test_i
        else:
            predict_test = np.concatenate((predict_test, predict_test_i))
        i += 1

    # Values
    labels_val = testd.getAllLabels()[0][::NENT, :]
    features_val = testd.getAllFeatures()[0][::NENT, 0, :]
    spectators_val = testd.getAllSpectators()[0][::NENT, 0, :]
    raw_features_val = testd.getAllSpectators()[-1][::NENT, 0, :]

    # Branch names
    print(testd.dataclass.branches)
    feature_names = testd.dataclass.branches[1]
    spectator_names = testd.dataclass.branches[0]

    # Truth names come from the training collection
    from DeepJetCore.DataCollection import DataCollection
    traind = DataCollection()
    traind.readFromFile(trainData)
    truthnames = traind.getUsedTruth()

    # Store features
    df = pd.DataFrame(spectators_val, columns=spectator_names)
    for i, tname in enumerate(feature_names):
        df[tname] = raw_features_val[:, i]

    # Add predictions
    print(truthnames)
    print(predict_test.shape)
    for i, tname in enumerate(truthnames):
        df['truth' + tname] = labels_val[:, i]
        df['predict' + tname] = predict_test[:, i]

    df.to_pickle(outputDir + '/output.pkl')  # save the dataframe
    print(df)
    dt = pd.read_pickle(outputDir + '/output.pkl')
    print(dt)

    def dists(xdf, truthnames):
        truths = truthnames
        print(truths)

        def distribution(xdf, predict="Hcc"):
            plt.figure(figsize=(10, 7))
            bins = np.linspace(0, 1, 70)
            trus = [xdf['truth' + tru].values for tru in truths]
            preds = [xdf['predict' + predict].values] * len(truths)
            plt.hist(preds, bins=bins, weights=trus, alpha=0.8,
                     density=True,  # 'normed' in older matplotlib
                     label=truths, stacked=True)
            plt.xlabel("Probability " + predict)
            plt.title("Stacked Distributions")
            plt.semilogy()
            plt.legend(title="True labels:")
            plt.savefig(outputDir + '/dist' + predict + '.png', dpi=300)

        for pred in truths:
            distribution(xdf, predict=pred)

    dists(df, truthnames)

    print("Testing prediction:")
    print("Total: ", len(predict_test[:, 0]))
    for lab in truthnames:
        print(lab, ":", sum(df['truth' + lab].values))
    print("Finished")
def train(self):
    placeholder_input, placeholder_output = self.model.get_placeholders()
    graph_output = self.model.get_compute_graphs()
    graph_loss = self.model.get_losses()
    graph_optimiser = self.model.get_optimizer()
    graph_summary = self.model.get_summary()

    if self.from_scratch:
        self.clean_summary_dir()

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        if self.use_tf_records:
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)
            record_batch_input, record_batch_target = \
                self.get_record_placeholders()
        else:
            input_data = self.config['train_data_path']
            train_data = DataCollection()
            train_data.readFromFile(input_data)
            val_data = train_data.split(0.1)
            train_data = train_data.split(0.9)
            train_data.setBatchSize(self.batch_size)
            val_data.setBatchSize(self.batch_size)
            val_data_generator = val_data.generator()
            train_data_generator = train_data.generator()

        summary_writer = tf.summary.FileWriter(self.summary_path, sess.graph)

        if not self.from_scratch:
            self.saver_all.restore(sess, self.model_path)
            print("\n\nINFO: Loading model\n\n")
            with open(self.model_path + '.txt', 'r') as f:
                iteration_number = int(f.read())
        else:
            iteration_number = 0

        print("Starting iterations")
        while iteration_number < self.train_for_iterations:
            if self.use_tf_records:
                input, output = sess.run(
                    [record_batch_input, record_batch_target])
                # decode the raw byte strings into float arrays
                input = [
                    np.frombuffer(i).reshape(
                        13, 13, int(self.config['num_layers']),
                        int(self.config['num_channels'])) for i in input
                ]
                output = [
                    np.frombuffer(i).reshape(
                        13, 13, int(self.config['num_layers'])) for i in output
                ]
            else:
                input, output, _ = next(train_data_generator)
                input = np.squeeze(input, axis=0)
                output = np.squeeze(output, axis=0)

            _, eval_loss, _, eval_summary = sess.run(
                [graph_output, graph_loss, graph_optimiser, graph_summary],
                feed_dict={
                    placeholder_input: input,
                    placeholder_output: output
                })
            print("Iteration %4d: loss %0.5f" % (iteration_number, eval_loss))
            iteration_number += 1
            summary_writer.add_summary(eval_summary, iteration_number)

            if iteration_number % self.save_after_iterations == 0:
                print("\n\nINFO: Saving model\n\n")
                self.saver_all.save(sess, self.model_path)
                with open(self.model_path + '.txt', 'w') as f:
                    f.write(str(iteration_number))

        if self.use_tf_records:
            coord.request_stop()   # stop the queue-runner threads
            coord.join(threads)    # wait for them to finish
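# Hedged sketch of what self.get_record_placeholders() might build with the
# TF1 queue-runner API used above. The feature keys ('input', 'target'), the
# file-list argument, and the batching parameters are illustrative assumptions.
import tensorflow as tf

def get_record_placeholders_sketch(file_list, batch_size):
    # queue of TFRecord files, read one serialized example at a time
    filename_queue = tf.train.string_input_producer(file_list)
    reader = tf.TFRecordReader()
    _, serialized = reader.read(filename_queue)
    # each example stores input and target tensors as raw byte strings,
    # matching the np.frombuffer decoding in the training loop above
    parsed = tf.parse_single_example(
        serialized,
        features={
            'input': tf.FixedLenFeature([], tf.string),
            'target': tf.FixedLenFeature([], tf.string),
        })
    # shuffled mini-batches served by background queue-runner threads
    return tf.train.shuffle_batch(
        [parsed['input'], parsed['target']],
        batch_size=batch_size, capacity=1000, min_after_dequeue=100)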