def load_test(tf, filename_test, preprocess=None):
    # Load the test data. The .npy file stores an [X, y] object array,
    # so allow_pickle is required on recent numpy versions.
    print("Loading test data...")
    X, y = np.load(filename_test, allow_pickle=True)
    X = np.array(X).astype(dict)
    y = np.array(y).astype(int)
    print("\tfilename = " + filename_test)
    print("\tX size = %d" % len(X))
    print("\ty size = %d" % len(y))

    # Preprocessing: rewrite the jet contents, optionally apply a custom
    # preprocessing step, order constituents by pT, extract features and
    # apply the fitted RobustScaler `tf`.
    print("Preprocessing...")
    X = multithreadmap(rewrite_content, X)
    if preprocess:
        X = multithreadmap(preprocess, X)
    X = multithreadmap(permute_by_pt, X)
    X = multithreadmap(extract, X)
    X = multithreadmap(tftransform, X, tf=tf)

    # Drop degenerate jets whose clustering tree collapsed to a single node.
    i = 0
    while i < len(y):
        if X[i]['tree'].shape == (1, 2):
            X, y = np.delete(X, i), np.delete(y, i)
        else:
            i += 1
    return X, y
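# `multithreadmap` is assumed to be defined elsewhere in this repo. A minimal
# stand-in with the same calling convention used throughout this file -- map a
# function over an iterable while broadcasting extra keyword arguments to every
# call -- is sketched below (the name and the ncores default are hypothetical):
def _multithreadmap_sketch(f, iterable, ncores=4, **kwargs):
    from multiprocessing import Pool
    from functools import partial
    # partial(f, **kwargs) freezes the broadcast keyword arguments, so the
    # pool only has to map over the varying positional argument.
    with Pool(ncores) as pool:
        return pool.map(partial(f, **kwargs), iterable)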
def load_tf(filename_train, preprocess=None, n_events_train=-1):
    # Load the training data and fit the content scaler on it.
    print("Loading training data...")
    X, y = np.load(filename_train, allow_pickle=True)
    X = np.array(X).astype(dict)
    y = np.array(y).astype(int)

    # Optionally restrict to a random subset of the training events.
    if n_events_train > 0:
        indices = np.random.permutation(len(X))[:n_events_train]
        X = X[indices]
        y = y[indices]

    print("\tfilename = " + filename_train)
    print("\tX size = %d" % len(X))
    print("\ty size = %d" % len(y))

    # Preprocessing: same chain as load_test, then fit a RobustScaler on
    # the stacked constituent contents.
    print("Preprocessing...")
    X = multithreadmap(rewrite_content, X)
    if preprocess:
        X = multithreadmap(preprocess, X)
    X = multithreadmap(permute_by_pt, multithreadmap(extract, X))
    Xcontent = multithreadmap(extractcontent, X)
    tf = RobustScaler().fit(np.vstack(Xcontent))
    return tf
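# Intended call order (file names are hypothetical placeholders): fit the
# RobustScaler on the training sample first, then reuse it to load and
# transform the test sample, e.g.
#
#   tf = load_tf(basepath + '/train_anti-kt.npy', n_events_train=100000)
#   X_test, y_test = load_test(tf, basepath + '/test_anti-kt.npy')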
def cleanarray(jets_array, addID=False):
    indexes = multithreadmap(find_first_non_particle, jets_array)
    jets_array = list(jets_array)
    for i in range(len(jets_array)):
        jets_array[i] = jets_array[i][:indexes[i]]
    jets_array = multithreadmap(select_particle_features, jets_array, addID=addID)
    return jets_array
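# Usage sketch (input hypothetical): each element of jets_array is a 2D
# per-particle feature array padded with non-particle rows at the end;
# cleanarray trims that padding and keeps the selected features, e.g.
#
#   jets = cleanarray(raw_jets_array, addID=True)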
def preprocess_for_training(filename, regression=False, R_clustering=0.3,
                            issignal=True, tosavefilename=''):
    events = np.array(np.load(filename, allow_pickle=True))
    signal = multithreadmap(create_jet_dictionary, events, cluster=cluster,
                            regression=regression, R=1000.)
    X = np.array(signal)

    # Targets: gen-level pT for regression, else a constant class label.
    if regression:
        y = np.array(multithreadmap(extract_component, X, component='genpt'))
    elif issignal:
        y = np.ones(len(X), dtype=int)
    else:
        y = np.zeros(len(X), dtype=int)

    def finalize_and_save(X_, suffix):
        # Common tail of every variant: rewrite contents, order by pT,
        # extract features and save the [X, y] pair.
        X_ = multithreadmap(rewrite_content, X_)
        X_ = multithreadmap(permute_by_pt, X_)
        X_ = multithreadmap(extract, X_)
        np.save(tosavefilename + suffix + ".npy", np.array([X_, y]))

    print('### kt ###')
    X_ = multithreadmap(preprocess, np.copy(X), output='kt', regression=regression,
                        cluster=cluster, R_clustering=R_clustering)
    finalize_and_save(X_, "kt")

    print('### cambridge ###')
    X_ = multithreadmap(preprocess, np.copy(X), output='cambridge', regression=regression,
                        cluster=cluster, R_clustering=R_clustering)
    finalize_and_save(X_, "cambridge")

    # Note: X itself is re-clustered with anti-kt here; the random and
    # pT-sequential variants below all start from this anti-kt version.
    X = multithreadmap(preprocess, X, output="anti-kt", regression=regression,
                       cluster=cluster, R_clustering=R_clustering)
    print('### anti-kt ###')
    finalize_and_save(np.copy(X), "anti-kt")

    print('### random tree ###')
    finalize_and_save(multithreadmap(randomize, np.copy(X)), "random")

    print('### seq by pt ###')
    finalize_and_save(multithreadmap(sequentialize_by_pt, np.copy(X), reverse=False), "seqpt")

    print('### seq by pt reversed ###')
    finalize_and_save(multithreadmap(sequentialize_by_pt, np.copy(X), reverse=True), "seqpt_reversed")

    return None
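# Usage sketch (paths hypothetical): one call per input file writes six .npy
# files, one per tree topology,
#
#   preprocess_for_training(basepath + '/signal_dataformat.npy',
#                           issignal=True,
#                           tosavefilename=basepath + '/npyfiles/signal_')
#
# producing signal_kt.npy, signal_cambridge.npy, signal_anti-kt.npy,
# signal_random.npy, signal_seqpt.npy and signal_seqpt_reversed.npy.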
#'QCD_Pt120to170',
#'QCD_Pt50to80',
#'QCD_Pt170to300_ext',
#'QCD_Pt120to170_ext']

#def app(txt):
#    return('/'+txt+'_dataformat.npy')

#signallist = multithreadmap(app,signallist)
#backgroundlist = multithreadmap(app,backgroundlist)

# Cluster each event file into jet dictionaries with ff (R = 1.0 cone).
background = []
for path_file in backgroundlist:
    events = np.array(np.load(basepath + path_file, allow_pickle=True))
    background = background + multithreadmap(ff, events, cluster=cluster, R=1.0)

signal = []
for path_file in signallist:
    events = np.array(np.load(basepath + path_file, allow_pickle=True))
    signal = signal + multithreadmap(ff, events, cluster=cluster, R=1.0)

# In[]:
### creating files to be preprocessed ###

# Balance the two classes and keep an even number of events per class.
nmax = min(len(signal), len(background))
if nmax % 2 == 1:
    nmax -= 1

X = np.array(background[:nmax] + signal[:nmax])
y = np.array([0] * nmax + [1] * nmax)
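# Sanity check: by construction the sample is balanced, with nmax background
# jets (label 0) followed by nmax signal jets (label 1).
assert len(X) == 2 * nmax
assert (y[:nmax] == 0).all() and (y[nmax:] == 1).all()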
        content = np.array(content).reshape(-1, 5)
        jets.append((tree, content, mass, pt))
    return jets


# In[]:

for t in ['train', 'test']:
    ### Loading and "jetting" data with ff ###
    signallist = ['/Background_JEC_' + t + '_ID.npy']

    signal = []
    for path_file in signallist:
        events = np.array(np.load(basepath + path_file, allow_pickle=True))
        signal = signal + multithreadmap(ff, events, cluster=cluster, regression=True, R=1000.)

    ## In[]:
    ### creating files to be preprocessed ###
    print(len(signal))

    X = np.array(signal)
    y = np.array(multithreadmap(extract_component, X, component='genpt'))

    for R_clustering, f in [(0.3, basepath + '/npyfilesregression/subjet_oriented_'),
                            (0.000001, basepath + '/npyfilesregression/particle_oriented_')]:
        if t == 'train':

            ## In[]:
            ### eliminate single particles ###
            i = 0
            while i < len(y):
                if X[i]['tree'].shape == (1, 2):
                    X, y = np.delete(X, i), np.delete(y, i)
                else:
                    i += 1
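            # Note: repeated np.delete calls inside the loop are O(n^2); an
            # equivalent vectorized filter (sketch) would be:
            #
            #   keep = np.array([x['tree'].shape != (1, 2) for x in X])
            #   X, y = X[keep], y[keep]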