def build_model(**parameters):
    sequence_len = parameters['sequence_len']
    dropout_01 = parameters['dropout_01']
    dropout_02 = parameters['dropout_02']
    dev_df, test_df, val_df = build_test_validation_df()
    if VERBOSE:
        print(dev_df.head())
    train_x, train_y = process_sb_df(dev_df, sequence_len)
    test_x, test_y = process_sb_df(test_df, sequence_len)
    print("train sells: ", train_y.count(0), ", holds: ", train_y.count(1), ", buys: ", train_y.count(2))
    print("test sells: ", test_y.count(0), ", holds: ", test_y.count(1), ", buys: ", test_y.count(2))
    print("sequence_len: ", sequence_len, ", dropout_01: ", dropout_01, ", dropout_02: ", dropout_02)
    input_shape = train_x.shape[1:]
    # Train model
    from keras.wrappers.scikit_learn import KerasClassifier
    model = KerasClassifier(build_fn=create_model, input_shape=input_shape,
                            dropout_01=dropout_01, dropout_02=dropout_02,
                            epochs=10, batch_size=128, verbose=1)
    score = -999.0
    if RUN_WITH_BEST_PARAMETERS:
        model.fit(train_x, train_y)
        model.model.save(MODEL_PATH)  # save the underlying Keras model; the sklearn wrapper has no save()
    else:
        from sklearn.model_selection import cross_val_score
        score = np.mean(cross_val_score(model, train_x, train_y, cv=3))
        print("average score: ", score)
    # Negate so an external minimiser can maximise the CV score.
    return -score
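# build_model() returns the negated CV score, which suggests it is meant to be driven
# by an external minimiser. A minimal sketch using scikit-optimize's gp_minimize; the
# search bounds here are illustrative assumptions, not values from the original code
# (and it assumes RUN_WITH_BEST_PARAMETERS is False so the cross-validation branch runs):
from skopt import gp_minimize
from skopt.space import Integer, Real
from skopt.utils import use_named_args

space = [Integer(30, 120, name='sequence_len'),
         Real(0.0, 0.5, name='dropout_01'),
         Real(0.0, 0.5, name='dropout_02')]

@use_named_args(space)
def objective(**params):
    # build_model returns -score, so minimising it maximises the CV score.
    return build_model(**params)

result = gp_minimize(objective, space, n_calls=20, random_state=42)
print('best parameters:', result.x, 'best score:', -result.fun)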
grid_result = grid.fit(base_model, outputs)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# grid_scores_ was removed from scikit-learn; cv_results_ carries the same information.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, std, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, std, param))

early_stopping = EarlyStopping(patience=20)
checkpointer = ModelCheckpoint('inception_resnet_bottleneck_drug_best.h5', verbose=1, save_best_only=True)
ImageFile.LOAD_TRUNCATED_IMAGES = True
model.fit_generator(batches, steps_per_epoch=num_train_steps, epochs=1000,
                    callbacks=[early_stopping, checkpointer],
                    validation_data=val_batches, validation_steps=num_valid_steps)
model.save_weights('inception_resnet_bottleneck_drug_weights.h5')
model.save('inception_resnet_bottleneck_drug.h5')

# Optional fine-tuning pass: unfreeze the last 31 layers and retrain with a small SGD step.
# for layer in model.layers[-31:]:
#     layer.trainable = True
# for layer in model.layers[:-31]:
#     layer.trainable = False
# model.compile(loss='categorical_crossentropy', optimizer=optimizers.SGD(lr=1e-4, momentum=0.9), metrics=['accuracy'])
# checkpointer = ModelCheckpoint('./resnet50_best_safety.h5', verbose=1, save_best_only=True)
# model.fit_generator(batches, steps_per_epoch=num_train_steps, epochs=1000, callbacks=[early_stopping, checkpointer], validation_data=val_batches, validation_steps=num_valid_steps)
# model.save('resnet50_safety.h5')
import pandas as pd
import keras
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier

# print(model.summary())
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['v1'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

batch_size = 32
model = createmodel()
model.fit(X_train, Y_train, epochs=5, batch_size=batch_size, verbose=2)
model.save('spam_model.h5')
score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
print(score)
print(acc)
classifier = KerasClassifier(build_fn=build_classifier)
parameters = {
    'batch_size': [40, 80],
    'epochs': [300, 600],
    'optimizer': ['adam', 'rmsprop'],
    'hidden_neurons': [9, 10]
}
grid = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy', cv=10)
grid_search = grid.fit(X_train, y_train)
best_parameters = grid_search.best_params_

# Rebuild a plain Keras model with the best parameters and train it on the full training set.
classifier = build_classifier(best_parameters['optimizer'], best_parameters['hidden_neurons'])
classifier.fit(X_train, y_train,
               epochs=best_parameters['epochs'],
               batch_size=best_parameters['batch_size'])
classifier.save("classifier.h5")
print("Saved model to disk")
print("Best parameters: {}".format(best_parameters))
best_accuracy = grid_search.best_score_
print("Best accuracy: {}".format(best_accuracy))
# Can load model with model = load_model(filename)
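# A minimal sketch of the reload path mentioned in the comment above. The filename
# matches this snippet, but the load_model import, the held-out X_test, and the
# single-sigmoid-output assumption are mine:
from keras.models import load_model

restored = load_model("classifier.h5")
y_prob = restored.predict(X_test)      # assumes one sigmoid output unit
y_pred = (y_prob > 0.5).astype(int)    # threshold at 0.5 for hard class labels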
# summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# create model
model = build_model(x_train.shape[1:], y_train.shape[-1], activation[0],
                    learn_rate[0], loss[0], optimizer[0], drop_rate[0])
print('built model..')

# Data Augmentation
datagen = ImageDataGenerator(featurewise_center=True,
                             featurewise_std_normalization=True,
                             rotation_range=0.0,
                             fill_mode='nearest',
                             horizontal_flip=True,
                             vertical_flip=True,
                             rescale=1. / 255,
                             preprocessing_function=None,
                             validation_split=0.25)
datagen.fit(x_train)
model.fit_generator(datagen.flow(x_train, y_train, batch_size=batch_size[0]), epochs=epochs[0])

# Save model with weights
model.save(data_dir + '/models/yelp_model.h5')
score = model.evaluate(x_test, y_test)
print('Model loss:', score)
# print('Model accuracy:', score)
# Truncated in the source: this continues a KerasClassifier(...) call, of which
# only the final argument survives.
#     ..., nb_epoch=runEpoch)
if os.access(modelName, os.F_OK):
    classifier = load_model(modelName)
classifier.fit(X_train, y_train, batch_size=BS, epochs=runEpoch,
               class_weight=class_weights, validation_data=(X_test, y_test), verbose=2)
y_predict = classifier.predict(X_test, batch_size=BS)
y_predict = [j[0] for j in y_predict]
y_predict = np.where(np.array(y_predict) < 0.5, 0, 1)
precision = precision_score(y_test, y_predict, average='macro')
recall = recall_score(y_test, y_predict, average='macro')
print("Precision:", precision)
print("Recall:", recall)
conf_mat = confusion_matrix(y_test, y_predict)  # renamed: assigning to 'confusion_matrix' shadows the sklearn function
print(conf_mat)
precision_p = float(conf_mat[1][1]) / float(conf_mat[0][1] + conf_mat[1][1])
recall_p = float(conf_mat[1][1]) / float(conf_mat[1][0] + conf_mat[1][1])
if os.access(modelName, os.F_OK):
    print(classifier.summary())
    classifier.save(modelName)
else:
    print(classifier.model.summary())
    classifier.model.save(modelName)
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer=optimizers.RMSprop(lr=1e-4), metrics=['acc'])
history = model.fit(X_train, y_train, epochs=35, batch_size=150, validation_data=(X_val, y_val))
model.save('best_model_imag_3clases.h5')
Y_pred_proba = model.predict(X_test)
Y_pred = model.predict_classes(X_test)
test_loss_trained_net, test_acc_trained_net = model.evaluate(X_test, y_test)
print('test_acc:', test_acc_trained_net)  # Result: 75% accuracy on the test set
matrix = confusion_matrix(y_test.argmax(axis=1), Y_pred)
labels_grouped = np.array(["Musica popular", "Musica melodica", "Musica ritmica"])
plot_confusion_matrix(y_test.argmax(axis=1), Y_pred, classes=labels_grouped,
                      title='Confusion matrix, without normalization')
# Plot the loss against the number of epochs
# classifier_builder(): the def line is missing in the source; the name is taken
# from the KerasClassifier call below.
def classifier_builder():
    classifier = Sequential()
    classifier.add(Embedding(max_features, output_dim=256))
    classifier.add(LSTM(128))
    classifier.add(Dropout(0.3))
    classifier.add(Dense(1, activation='sigmoid'))
    classifier.compile(loss='binary_crossentropy',
                       optimizer='Adam',  # 'rmsprop'
                       metrics=['accuracy'])
    return classifier

# Wrap the builder in a scikit-learn compatible classifier object. If a saved model
# exists, load it instead: a loaded Keras model exposes save()/summary() directly,
# while a fresh wrapper keeps them on .model, hence the branches below.
classifier = KerasClassifier(build_fn=classifier_builder, batch_size=1024, nb_epoch=runEpoch)  # 10)
if os.access("lstm_model.h5", os.F_OK):
    classifier = load_model('lstm_model.h5')
hist = classifier.fit(X_train, y_train, batch_size=1024, epochs=runEpoch)
print(hist.history)
if os.access("lstm_model.h5", os.F_OK):
    print(classifier.summary())
    classifier.save('lstm_model.h5')
else:
    print(classifier.model.summary())
    classifier.model.save('lstm_model.h5')
print(valid_x_data.shape, valid_y_data.shape)

if args.tunning:
    model = KerasClassifier(build_fn=tunnel_model, input_shape=(50, maxlen), clear_session=True)
    parameter_tunning(model, train_x_data, train_y_data)
    sys.exit(0)

model = build_model_graph(input_shape=(charmap_size, maxlen), model='lstm_model_endgame')
# checkpointer = ModelCheckpoint(filepath='/tmp/weights.model',
#                                verbose=1, monitor='val_acc',
#                                save_best_only=True)
train_model(model, train_x_data, train_y_data,
            validation_data=(valid_x_data, valid_y_data),
            batch_size=256, epochs=30, with_weights=False)  # , checkpointer=checkpointer)

if args.save_model:
    model.save(args.save_model)

test_x_data, test_y_data = pickle.load(open('test_data.pkl', 'rb'))
# test_x_data, test_y_data = pickle.load(open('train_data.pkl', 'rb'))
test_binary_model(model, test_x_data, test_y_data, threshold)
data = pd.read_csv("KDDTrain+.txt")
data.drop(data.columns[[0, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
                        23, 26, 27, 32, 34, 39, 40, 42]],
          axis=1, inplace=True)
le = LabelEncoder()
data['protocol_type'] = le.fit_transform(data['protocol_type'])
data['service'] = le.fit_transform(data['service'])
data['flag'] = le.fit_transform(data['flag'])
data['a_class'] = le.fit_transform(data['a_class'])
values = data.iloc[:, 0:20].values
labels = data.iloc[:, 20].values
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledValues = scaler.fit_transform(values)
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=1)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)  # random_state requires shuffle=True
results = cross_val_score(model, rescaledValues, labels, cv=kfold)
print(results.mean())
# cross_val_score fits clones of the estimator, so fit once more on the full data
# before saving, and save the underlying Keras model (the wrapper has no save()).
model.fit(rescaledValues, labels)
model.model.save('my_model.h5')
# Truncated in the source: this continues a KerasClassifier(...) call, of which
# only the final argument survives.
#     ..., nb_epoch=runEpoch)
if os.access("lstm_reshape_5_256.md", os.F_OK):
    classifier = load_model('lstm_reshape_5_256.md')
classifier.fit(X_train, y_train, batch_size=BS, epochs=runEpoch,
               class_weight=class_weights, validation_data=(X_test, y_test))
y_predict = classifier.predict(X_test, batch_size=BS)
y_predict = [j[0] for j in y_predict]
y_predict = np.where(np.array(y_predict) < 0.5, 0, 1)
precision = precision_score(y_test, y_predict, average='macro')
recall = recall_score(y_test, y_predict, average='macro')
print("Precision:", precision)
print("Recall:", recall)
conf_mat = confusion_matrix(y_test, y_predict)  # renamed to avoid shadowing the sklearn function
print(conf_mat)
if os.access("lstm_reshape_5.md", os.F_OK):
    print(classifier.summary())
    classifier.save('lstm_reshape_5_256.md')
else:
    print(classifier.model.summary())
    classifier.model.save('lstm_reshape_5_256.md')
file_name_base = "lstm_model.h5"
# num to string
str_runLoop = '%d' % runLoop
str_batch_size = '%d' % batch_size
file_name = str_runLoop + "_" + str_batch_size + "_" + file_name_base
if os.access(file_name, os.F_OK):
    classifier = load_model(file_name)
for i in range(0, runLoop):
    hist = classifier.fit(X_train, y_train, batch_size=batch_size, epochs=runEpoch,
                          class_weight=class_weights, validation_data=(X_test, y_test))
    print("loop i=", i, "hist:", hist.history)
    y_predict = classifier.predict(X_test, batch_size=batch_size)
    y_predict = [j[0] for j in y_predict]
    y_predict = np.where(np.array(y_predict) < 0.5, 0, 1)
    precision = precision_score(y_test, y_predict, average='macro')
    recall = recall_score(y_test, y_predict, average='macro')
    print("Precision:", precision)
    print("Recall:", recall)
if os.access(file_name, os.F_OK):
    print(classifier.summary())
    classifier.save(file_name)
else:
    print(classifier.model.summary())
    classifier.model.save(file_name)
print("End----------")
print()
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()
history = model.fit(X_train, y_train, epochs=35, batch_size=120, validation_data=(X_val, y_val))
model.save('best_model_feat_10classes.h5')
Y_pred_proba = model.predict(X_test)
Y_pred = model.predict_classes(X_test)
test_loss_trained_net, test_acc_trained_net = model.evaluate(X_test, y_test)
print('test_acc:', test_acc_trained_net)  # Result: 69% accuracy on the test set
matrix = confusion_matrix(y_test.argmax(axis=1), Y_pred)
labels = np.array(clases)
plot_confusion_matrix(y_test.argmax(axis=1), Y_pred, classes=labels,
                      title='Confusion matrix, without normalization')
# Plot the loss against the number of epochs
# encoder = Model(input=input_dim, output=encoded)
# encoded_input = Input(shape=(encoding_dim, ))
# X_test_encoded = encoder.predict(sX)
# encoder.save('my_encoder.h5')

# "PCA" here is TruncatedSVD (LSA), which works on the sparse tf-idf matrix directly.
print('PCA start')
svd = TruncatedSVD(n_components=500, n_iter=7, random_state=42)
svd.fit(X_train_tfidf)
X_train_tfidf = svd.transform(X_train_tfidf)
print('PCA done')

# split data into training and testing
# from sklearn.cross_validation import train_test_split
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, y, test_size=0.5)

estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X_train_tfidf, y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))
# cross_val_score fits clones of the estimator, so fit once on the full data and
# save the underlying Keras model (the sklearn wrapper has no save()).
estimator.fit(X_train_tfidf, y)
estimator.model.save('my_dnn_model.h5')
# Truncated in the source: this continues a Dense(...) layer call applied to L.
#           kernel_regularizer=regularizers.l2(0.001))(L)
print('Dense layer is:', L)
model = Model(inputs=sequence_input, outputs=L)

# Optimization and compile
opt = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, decay=0.01)
print('Begin compiling...')
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

# Begin training
model.fit(data_train, Y_train, batch_size=batch_size, epochs=epochs, verbose=2,
          validation_data=(data_val, Y_val))
score = model.evaluate(data_test, Y_test, batch_size=batch_size)
print('The evaluation is: ', score)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)

# Save model
print('Saving model...')
model.save('CNN-LSTM-Turkish corpus-200d')
# Truncated in the source: this continues a classifier.fit(...) call.
#                validation_data=(X_test, y_test), verbose=2)
bk.set_learning_phase(0)  # inference phase
print("After set ", bk.learning_phase())
y_predict = classifier.predict(X_test, batch_size=BS)
y_predict = [j[0] for j in y_predict]
y_predict = np.where(np.array(y_predict) < 0.5, 0, 1)
conf_mat = confusion_matrix(y_test, y_predict)  # renamed to avoid shadowing the sklearn function
print(conf_mat)
precision_p = float(conf_mat[1][1]) / float(conf_mat[0][1] + conf_mat[1][1])
recall_p = float(conf_mat[1][1]) / float(conf_mat[1][0] + conf_mat[1][1])
print("Precision:", precision_p)
print("Recall:", recall_p)
if os.access("lstm_lxr.md", os.F_OK):
    print(classifier.summary())
    classifier.save('lstm_lxr.md')
else:
    print(classifier.model.summary())
    classifier.model.save('lstm_lxr.md')
# ('Precision:', 0.9660511363636364)
# ('Recall:', 0.9601863617111394)
def main():
    print('Using Keras version: ', keras.__version__)
    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument('-t', '--train_model', dest='train_model',
                        help='Option to train model or simply make diagnostic plots (0=False, 1=True)',
                        default=1, type=int)
    parser.add_argument('-s', '--suff', dest='suffix',
                        help='Option to choose suffix for training', default='', type=str)
    parser.add_argument('-p', '--para', dest='hyp_param_scan',
                        help='Option to run hyper-parameter scan', default=0, type=int)
    parser.add_argument('-i', '--inputs_file_path', dest='inputs_file_path',
                        help='Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.',
                        default='', type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    suffix = args.suffix

    # Create instance of the input files directory
    inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/'
    hyp_param_scan = args.hyp_param_scan

    # Set model hyper-parameters
    weights = 'BalanceYields'  # 'BalanceYields' or 'BalanceNonWeighted'
    optimizer = 'Nadam'
    validation_split = 0.1
    # hyper-parameter scan results
    if weights == 'BalanceNonWeighted':
        learn_rate = 0.0005
        epochs = 200
        batch_size = 200
    if weights == 'BalanceYields':
        learn_rate = 0.0001
        epochs = 200
        batch_size = 400

    # Create instance of output directory where all results are saved.
    output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix, weights)
    check_dir(output_directory)
    hyperparam_file = os.path.join(output_directory, 'additional_model_hyper_params.txt')
    additional_hyperparams = open(hyperparam_file, 'w')
    additional_hyperparams.write("optimizer: " + optimizer + "\n")
    additional_hyperparams.write("learn_rate: " + str(learn_rate) + "\n")
    additional_hyperparams.write("epochs: " + str(epochs) + "\n")
    additional_hyperparams.write("validation_split: " + str(validation_split) + "\n")
    additional_hyperparams.write("weights: " + weights + "\n")

    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')
    input_var_jsonFile = open('input_variables.json', 'r')
    selection_criteria = '( ((Leading_Photon_pt/CMS_hgg_mass) > 0.35) && ((Subleading_Photon_pt/CMS_hgg_mass) > 0.25) && passbVeto==1 && ExOneLep==1 && N_goodJets>=1)'
    # selection_criteria = '(AtLeast4GoodJets0Lep==1)'
    # selection_criteria = '(passPhotonSels==1 && passbVeto==1 && ExOneLep==1 && goodJets==1)'

    # Load Variables from .json
    variable_list = json.load(input_var_jsonFile, encoding="utf-8").items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('weight')
    column_headers.append('unweighted')
    column_headers.append('target')
    column_headers.append('key')
    column_headers.append('classweight')
    column_headers.append('process_ID')

    # Create instance of the input files directory
    # inputs_file_path = '/afs/cern.ch/work/a/atishelm/public/ForJosh/2017_DataMC_ntuples_moreVars'
    inputs_file_path = '/eos/user/r/rasharma/post_doc_ihep/double-higgs/ntuples/September29/MVANtuples'
    # inputs_file_path = '/eos/user/a/atishelm/ntuples/HHWWgg_DataSignalMCnTuples/PromptPromptApplied/'
    # inputs_file_path = 'PromptPromptApplied/'

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe.csv' % (output_directory)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path))
        data = load_data(inputs_file_path, column_headers, selection_criteria)
        # Change sentinel value to speed up training.
        data = data.replace(to_replace=-999.000000, value=-9.0)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)
    print('<main> data columns: ', (data.columns.values.tolist()))
    n = len(data)
    nHH = len(data.iloc[data.target.values == 1])
    nbckg = len(data.iloc[data.target.values == 0])
    print("Total (train+validation) length of HH = %i, bckg = %i" % (nHH, nbckg))

    # Make instance of plotter tool
    Plotter = plotter()

    # Create statistically independent training/testing data
    traindataset, valdataset = train_test_split(data, test_size=0.1)
    valdataset.to_csv((output_directory + 'valid_dataset.csv'), index=False)
    print('<train-DNN> Training dataset shape: ', traindataset.shape)
    print('<train-DNN> Validation dataset shape: ', valdataset.shape)

    # Event weights
    weights_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH', 'weight']
    weights_for_DiPhoton = traindataset.loc[traindataset['process_ID'] == 'DiPhoton', 'weight']
    weights_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet', 'weight']
    weights_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY', 'weight']
    weights_for_TTGG = traindataset.loc[traindataset['process_ID'] == 'TTGG', 'weight']
    weights_for_TTGJets = traindataset.loc[traindataset['process_ID'] == 'TTGJets', 'weight']
    weights_for_TTJets = traindataset.loc[traindataset['process_ID'] == 'TTJets', 'weight']
    weights_for_WJets = traindataset.loc[traindataset['process_ID'] == 'WJets', 'weight']
    weights_for_ttH = traindataset.loc[traindataset['process_ID'] == 'ttH', 'weight']
    HHsum_weighted = sum(weights_for_HH)
    GJetsum_weighted = sum(weights_for_GJet)
    DiPhotonsum_weighted = sum(weights_for_DiPhoton)
    TTGGsum_weighted = sum(weights_for_TTGG)
    TTGJetssum_weighted = sum(weights_for_TTGJets)
    TTJetssum_weighted = sum(weights_for_TTJets)
    WJetssum_weighted = sum(weights_for_WJets)
    ttHsum_weighted = sum(weights_for_ttH)
    DYsum_weighted = sum(weights_for_DY)
    # bckgsum_weighted = DiPhotonsum_weighted + WJetssum_weighted + ttHsum_weighted
    bckgsum_weighted = DiPhotonsum_weighted + WJetssum_weighted
    nevents_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH', 'unweighted']
    nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID'] == 'DiPhoton', 'unweighted']
    nevents_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet', 'unweighted']
    nevents_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY', 'unweighted']
    nevents_for_TTGG = traindataset.loc[traindataset['process_ID'] == 'TTGG', 'unweighted']
    nevents_for_TTGJets = traindataset.loc[traindataset['process_ID'] == 'TTGJets', 'unweighted']
    nevents_for_TTJets = traindataset.loc[traindataset['process_ID'] == 'TTJets', 'unweighted']
    nevents_for_WJets = traindataset.loc[traindataset['process_ID'] == 'WJets', 'unweighted']
    nevents_for_ttH = traindataset.loc[traindataset['process_ID'] == 'ttH', 'unweighted']
    HHsum_unweighted = sum(nevents_for_HH)
    GJetsum_unweighted = sum(nevents_for_GJet)
    DiPhotonsum_unweighted = sum(nevents_for_DiPhoton)
    TTGGsum_unweighted = sum(nevents_for_TTGG)
    TTGJetssum_unweighted = sum(nevents_for_TTGJets)
    TTJetssum_unweighted = sum(nevents_for_TTJets)
    WJetssum_unweighted = sum(nevents_for_WJets)
    ttHsum_unweighted = sum(nevents_for_ttH)
    DYsum_unweighted = sum(nevents_for_DY)
    # bckgsum_unweighted = DiPhotonsum_unweighted + WJetssum_unweighted + ttHsum_unweighted
    bckgsum_unweighted = DiPhotonsum_unweighted + WJetssum_unweighted

    if weights == 'BalanceYields':
        print('HHsum_weighted= ', HHsum_weighted)
        print('ttHsum_weighted= ', ttHsum_weighted)
        print('DiPhotonsum_weighted= ', DiPhotonsum_weighted)
        print('WJetssum_weighted= ', WJetssum_weighted)
        print('DYsum_weighted= ', DYsum_weighted)
        print('GJetsum_weighted= ', GJetsum_weighted)
        print('bckgsum_weighted= ', bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'HH', ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID'] == 'GJet', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DY', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WJets', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGG', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGJets', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'TTJets', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'ttH', ['classweight']] = (HHsum_weighted / bckgsum_weighted)

    if weights == 'BalanceNonWeighted':
        print('HHsum_unweighted= ', HHsum_unweighted)
        print('ttHsum_unweighted= ', ttHsum_unweighted)
        print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted)
        print('WJetssum_unweighted= ', WJetssum_unweighted)
        print('DYsum_unweighted= ', DYsum_unweighted)
        print('GJetsum_unweighted= ', GJetsum_unweighted)
        print('bckgsum_unweighted= ', bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'HH', ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID'] == 'GJet', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DY', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'WJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGG', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'TTJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'ttH', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)

    # Remove column headers that aren't input variables
    training_columns = column_headers[:-6]
    print('<train-DNN> Training features: ', training_columns)
    column_order_txt = '%s/column_order.txt' % (output_directory)
    column_order_file = open(column_order_txt, "wb")
    for tc_i in training_columns:
        line = tc_i + "\n"
        pickle.dump(str(line), column_order_file)
    num_variables = len(training_columns)

    # Extract training and testing data
    X_train = traindataset[training_columns].values
    X_test = valdataset[training_columns].values
    # Extract labels data
    Y_train = traindataset['target'].values
    Y_test = valdataset['target'].values

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot.png'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    ####################################################################################
    # Weights applied during training. You will also need to update the class weights if
    # you are going to change the event weights applied. Introduce class weights and any
    # event weight you want to use here.
    # trainingweights = traindataset.loc[:, 'classbalance']  # * traindataset.loc[:, 'weight']
    # trainingweights = np.array(trainingweights)
    # Temp hack to be able to change class weights without remaking dataframe
    # for inde in xrange(len(trainingweights)):
    #     newweight = 13243.0 / 6306.0
    #     trainingweights[inde] = newweight
    # print 'training event weight = ', trainingweights[0]

    # Event weights calculation so we can correctly apply event weights to diagnostic plots.
    # Use a separate list because we don't want to apply class weights in plots.
    # Event weights if wanted
    train_weights = traindataset['weight'].values
    test_weights = valdataset['weight'].values

    # Weights applied during training.
    if weights == 'BalanceYields':
        trainingweights = traindataset.loc[:, 'classweight'] * traindataset.loc[:, 'weight']
    if weights == 'BalanceNonWeighted':
        trainingweights = traindataset.loc[:, 'classweight']
    trainingweights = np.array(trainingweights)

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot.pdf'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)

    if do_model_fit == 1:
        print('<train-BinaryDNN> Training new model . . . . ')
        histories = []
        labels = []
        if hyp_param_scan == 1:
            print('Begin at local time: ', time.localtime())
            hyp_param_scan_name = 'hyp_param_scan_results.txt'
            hyp_param_scan_results = open(hyp_param_scan_name, 'a')
            time_str = str(time.localtime()) + '\n'
            hyp_param_scan_results.write(time_str)
            hyp_param_scan_results.write(weights)
            learn_rates = [0.00001, 0.0001]
            epochs = [150, 200]
            batch_size = [400, 500]
            param_grid = dict(learn_rate=learn_rates, epochs=epochs, batch_size=batch_size)
            model = KerasClassifier(build_fn=gscv_model, verbose=0)
            grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
            grid_result = grid.fit(X_train, Y_train, shuffle=True, sample_weight=trainingweights)
            print("Best score: %f , best params: %s" % (grid_result.best_score_, grid_result.best_params_))
            hyp_param_scan_results.write("Best score: %f , best params: %s\n"
                                         % (grid_result.best_score_, grid_result.best_params_))
            means = grid_result.cv_results_['mean_test_score']
            stds = grid_result.cv_results_['std_test_score']
            params = grid_result.cv_results_['params']
            for mean, stdev, param in zip(means, stds, params):
                print("Mean (stdev) test score: %f (%f) with parameters: %r" % (mean, stdev, param))
                hyp_param_scan_results.write("Mean (stdev) test score: %f (%f) with parameters: %r\n"
                                             % (mean, stdev, param))
            exit()
        else:
            # Define model for analysis
            early_stopping_monitor = EarlyStopping(patience=30, monitor='val_loss', verbose=1)
            model = baseline_model(num_variables, learn_rate=learn_rate)
            # Fit the model
            # Batch size = examples before updating weights (larger = faster training)
            # Epoch = One pass over data (useful for periodic logging and evaluation)
            # class_weights = np.array(class_weight.compute_class_weight('balanced', np.unique(Y_train), Y_train))
            history = model.fit(X_train, Y_train,
                                validation_split=validation_split,
                                epochs=epochs, batch_size=batch_size,
                                verbose=1, shuffle=True,
                                sample_weight=trainingweights,
                                callbacks=[early_stopping_monitor])
            histories.append(history)
            labels.append(optimizer)
            # Make plot of loss function evolution
            Plotter.plot_training_progress_acc(histories, labels)
            acc_progress_filename = 'DNN_acc_wrt_epoch.png'
            Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename)
    else:
        model_name = os.path.join(output_directory, 'model.h5')
        model = load_trained_model(model_name)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))
    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory, 'model_schematic.eps')
    print("DEBUG: ", model_schematic_name)
    plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)
    # plot_model(model, to_file='model_schematic.eps', show_shapes=True, show_layer_names=True)

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory
    '''
    print('================')
    print('Training event labels: ', len(Y_train))
    print('Training event probs', len(result_probs))
    print('Training event weights: ', len(train_weights))
    print('Testing events: ', len(Y_test))
    print('Testing event probs', len(result_probs_test))
    print('Testing event weights: ', len(test_weights))
    print('================')
    '''

    # Make overfitting plots of output nodes
    Plotter.binary_overfitting(model, Y_train, Y_test, result_probs, result_probs_test,
                               plots_dir, train_weights, test_weights)
    print("DEBUG: Y_train shape: ", Y_train.shape)

    # # Get true process integers for training dataset
    # original_encoded_train_Y = []
    # for i in xrange(len(result_probs)):
    #     if Y_train[i][0] == 1:
    #         original_encoded_train_Y.append(0)
    #     if Y_train[i][1] == 1:
    #         original_encoded_train_Y.append(1)
    #     if Y_train[i][2] == 1:
    #         original_encoded_train_Y.append(2)
    #     if Y_train[i][3] == 1:
    #         original_encoded_train_Y.append(3)

    # Get true class values for testing dataset
    # result_classes_test = newencoder.inverse_transform(result_classes_test)
    # result_classes_train = newencoder.inverse_transform(result_classes)

    e = shap.DeepExplainer(model, X_train[:400, ])
    shap_values = e.shap_values(X_test[:400, ])
    Plotter.plot_dot(title="DeepExplainer_sigmoid_y0", x=X_test[:400, ],
                     shap_values=shap_values, column_headers=column_headers)
    Plotter.plot_dot_bar(title="DeepExplainer_Bar_sigmoid_y0", x=X_test[:400, ],
                         shap_values=shap_values, column_headers=column_headers)
    # e = shap.GradientExplainer(model, X_train[:100, ])
    # shap_values = e.shap_values(X_test[:100, ])
    # Plotter.plot_dot(title="GradientExplainer_sigmoid_y0", x=X_test[:100, ], shap_values=shap_values, column_headers=column_headers)
    # e = shap.KernelExplainer(model.predict, X_train[:100, ])
    # shap_values = e.shap_values(X_test[:100, ])
    # Plotter.plot_dot(title="KernelExplainer_sigmoid_y0", x=X_test[:100, ], shap_values=shap_values, column_headers=column_headers)
    # Plotter.plot_dot_bar(title="KernelExplainer_Bar_sigmoid_y0", x=X_test[:100, ], shap_values=shap_values, column_headers=column_headers)
    # Plotter.plot_dot_bar_all(title="KernelExplainer_bar_All_Var_sigmoid_y0", x=X_test[:100, ], shap_values=shap_values, column_headers=column_headers)

    # Create confusion matrices for training and testing performance
    # Plotter.conf_matrix(original_encoded_train_Y, result_classes_train, train_weights, 'index')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y, result_classes_test, test_weights, 'index')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TEST.png')
    # Plotter.conf_matrix(original_encoded_train_Y, result_classes_train, train_weights, 'columns')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y, result_classes_test, test_weights, 'columns')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TEST.png')
    # Plotter.conf_matrix(original_encoded_train_Y, result_classes_train, train_weights, '')
    # Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y, result_classes_test, test_weights, '')
    # Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TEST.png')

    Plotter.ROC_sklearn(Y_train, result_probs, Y_test, result_probs_test, 1,
                        'BinaryClassifierROC', train_weights, test_weights)
parameters = {  # opening reconstructed; the source begins mid-dict
    'optimizer': ['rmsprop'],
    'neurons': [5, 6, 7],
    'n_layer': [3]
}
grid_search = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy', cv=10)
grid_search = grid_search.fit(X_train, y_train)
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

# evaluation (k-fold cross-validation included in grid search)
# classifier = KerasClassifier(build_fn=make_my_classifier, batch_size=32, epochs=250)
# accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
# mean = accuracies.mean()
# variance = accuracies.std()

# ------------------------------------------------------------------------------
# Predictions
# ------------------------------------------------------------------------------
# Use the refit best estimator; the bare KerasClassifier wrapper was never fitted.
best_model = grid_search.best_estimator_
prediction = best_model.predict(X_test)
prediction = (prediction > 0.5)
cm = confusion_matrix(y_test, prediction)

# ------------------------------------------------------------------------------
# Save and/or load model
# ------------------------------------------------------------------------------
# Save the underlying Keras model; the sklearn wrapper has no save().
best_model.model.save('wine_good_or_nah.h5', overwrite=True)
km.load_model('wine_good_or_nah.h5')
    # Tail of create_classifier(); the def line and earlier layers are missing in the source.
    ann_classifier.add(Dense(units=1,
                             kernel_initializer="uniform",  # output layer
                             activation="sigmoid"))
    optimizer = Adam(lr, decay)
    ann_classifier.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["acc"])
    return ann_classifier

ann_classifier = KerasClassifier(build_fn=create_classifier)

# Visualization of the model: build a concrete Keras model first, since the sklearn
# wrapper has no summary() or layers to plot (assumes create_classifier's arguments
# all have defaults).
viz_model = create_classifier()
print(viz_model.summary())
plot_model(viz_model, to_file='ann_classifier_plot.png', show_shapes=True, show_layer_names=True)

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=ann_classifier, param_grid=param_grid,
                    scoring="accuracy", cv=kfold, verbose=1)
grid_results = grid.fit(X=x_train_scaled, y=y_train)
# Save the Keras model underlying the refit best estimator; the wrapper has no save().
grid_results.best_estimator_.model.save("ann_adam.h5")
grid_search = GridSearchCV(estimator=classifier,  # opening reconstructed; the source begins mid-call
                           param_grid=parameters,
                           scoring="accuracy",
                           cv=10,
                           n_jobs=-1)
grid_search = grid_search.fit(X_train, y_train)

## Getting best parameters from the GridSearchCV
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

## Building and Fitting the ANN with the best parameters
classifier = build_classifier(optimizer=best_parameters.get("optimizer"),
                              nb_layers=best_parameters.get("nb_layers"),
                              dropout=best_parameters.get("dropout"))
classifier.fit(X_train, y_train,
               batch_size=best_parameters.get("batch_size"),
               epochs=best_parameters.get("epochs"))

## Saving the model for reuse
classifier.save("churn.h5")

## Making the predictions and evaluating the model
y_pred = classifier.predict(X_test)
y_pred = (y_pred > .5)
cm = confusion_matrix(y_test, y_pred)
accuracy = (cm[0, 0] + cm[1, 1]) / X_test.shape[0]
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
## Accuracy of 86.5% confirmed

#### SAVING/LOADING MODEL:
classifier.save('my_classifier.h5')
# classifier = load_model('my_classifier.h5')

### SINGLE PREDICTION
# two pairs of square brackets + feature scaling (NO FIT! - just transform)
# To remove the warning, make the first element a float
# john = np.array([[0.0, 1, 555, 1, 51, 5, 1550000, 5, 1, 1, 120000]])
# john_transform = sc.transform(john)
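# A minimal sketch of the single-prediction flow outlined in the comments above,
# reusing the commented-out 'john' feature row and this snippet's sc scaler and
# classifier; interpreting the sigmoid output as a class probability is an assumption:
john = np.array([[0.0, 1, 555, 1, 51, 5, 1550000, 5, 1, 1, 120000]])
john_scaled = sc.transform(john)              # transform only; the scaler was fitted on X_train
john_prob = classifier.predict(john_scaled)   # sigmoid output in [0, 1]
print('Predicted positive class:', bool(john_prob[0][0] > 0.5))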
# Truncated in the source: this continues a Dense(...) layer call applied to L.
#           kernel_regularizer=regularizers.l2(0.001))(L)
print('Dense layer is:', L)
model = Model(inputs=sequence_input, outputs=L)

# Optimization and compile
opt = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, decay=0.01)
print('Begin compiling...')
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

# Begin training
model.fit(data_train, Y_train, batch_size=batch_size, epochs=epochs, verbose=2,
          validation_data=(data_val, Y_val))
score = model.evaluate(data_test, Y_test, batch_size=batch_size)
print('The evaluation is: ', score)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)

# Save model
print('Saving model...')
model.save('CNN-GRU-Turkish corpus-200d')
def main():
    print('Using Keras version: ', keras.__version__)
    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument('-t', '--train_model', dest='train_model',
                        help='Option to train model or simply make diagnostic plots (0=False, 1=True)',
                        default=1, type=int)
    parser.add_argument('-s', '--suff', dest='suffix',
                        help='Option to choose suffix for training', default='', type=str)
    parser.add_argument('-p', '--para', dest='hyp_param_scan',
                        help='Option to run hyper-parameter scan', default=0, type=int)
    parser.add_argument('-i', '--inputs_file_path', dest='inputs_file_path',
                        help='Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.',
                        default='', type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    suffix = args.suffix

    # Create instance of the input files directory
    # inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/'
    inputs_file_path = '/eos/user/b/bmarzocc/HHWWgg/January_2021_Production/2017/'
    hyp_param_scan = args.hyp_param_scan

    # Set model hyper-parameters
    weights = 'BalanceYields'  # 'BalanceYields' or 'BalanceNonWeighted'
    optimizer = 'Nadam'
    validation_split = 0.1
    # hyper-parameter scan results
    if weights == 'BalanceNonWeighted':
        learn_rate = 0.0005
        epochs = 200
        batch_size = 200
    if weights == 'BalanceYields':
        learn_rate = 0.0001
        epochs = 200
        batch_size = 32
        # epochs = 10
        # batch_size = 200

    # Create instance of output directory where all results are saved.
    output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix, weights)
    check_dir(output_directory)
    hyperparam_file = os.path.join(output_directory, 'additional_model_hyper_params.txt')
    additional_hyperparams = open(hyperparam_file, 'w')
    additional_hyperparams.write("optimizer: " + optimizer + "\n")
    additional_hyperparams.write("learn_rate: " + str(learn_rate) + "\n")
    additional_hyperparams.write("epochs: " + str(epochs) + "\n")
    additional_hyperparams.write("validation_split: " + str(validation_split) + "\n")
    additional_hyperparams.write("weights: " + weights + "\n")

    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')
    input_var_jsonFile = open('input_variables.json', 'r')
    selection_criteria = '( (Leading_Photon_pt/CMS_hgg_mass) > 1/3 && (Subleading_Photon_pt/CMS_hgg_mass) > 1/4 )'

    # Load Variables from .json
    variable_list = json.load(input_var_jsonFile, encoding="utf-8").items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('weight')
    column_headers.append('unweighted')
    column_headers.append('target')
    column_headers.append('key')
    column_headers.append('classweight')
    column_headers.append('process_ID')

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe.csv' % (output_directory)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path))
        data = load_data(inputs_file_path, column_headers, selection_criteria)
        # Change sentinel value to speed up training.
        data = data.mask(data < -25., -9.)
        # data = data.replace(to_replace=-99., value=-9.0)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)
    print('<main> data columns: ', (data.columns.values.tolist()))
    n = len(data)
    nHH = len(data.iloc[data.target.values == 1])
    nbckg = len(data.iloc[data.target.values == 0])
    print("Total (train+validation) length of HH = %i, bckg = %i" % (nHH, nbckg))

    # Make instance of plotter tool
    Plotter = plotter()

    # Create statistically independent training/testing data
    traindataset, valdataset = train_test_split(data, test_size=0.1)
    valdataset.to_csv((output_directory + 'valid_dataset.csv'), index=False)
    print('<train-DNN> Training dataset shape: ', traindataset.shape)
    print('<train-DNN> Validation dataset shape: ', valdataset.shape)

    # Event weights
    weights_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH', 'weight']
    weights_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg', 'weight']
    weights_for_DiPhoton = traindataset.loc[traindataset['process_ID'] == 'DiPhoton', 'weight']
    weights_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet', 'weight']
    weights_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD', 'weight']
    weights_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY', 'weight']
    weights_for_TTGsJets = traindataset.loc[traindataset['process_ID'] == 'TTGsJets', 'weight']
    weights_for_WGsJets = traindataset.loc[traindataset['process_ID'] == 'WGsJets', 'weight']
    weights_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW', 'weight']
    HHsum_weighted = sum(weights_for_HH)
    Hggsum_weighted = sum(weights_for_Hgg)
    DiPhotonsum_weighted = sum(weights_for_DiPhoton)
    GJetsum_weighted = sum(weights_for_GJet)
    QCDsum_weighted = sum(weights_for_QCD)
    DYsum_weighted = sum(weights_for_DY)
    TTGsJetssum_weighted = sum(weights_for_TTGsJets)
    WGsJetssum_weighted = sum(weights_for_WGsJets)
    WWsum_weighted = sum(weights_for_WW)
    bckgsum_weighted = (Hggsum_weighted + DiPhotonsum_weighted + GJetsum_weighted + QCDsum_weighted
                        + DYsum_weighted + TTGsJetssum_weighted + WGsJetssum_weighted + WWsum_weighted)
    # bckgsum_weighted = DiPhotonsum_weighted + GJetsum_weighted + QCDsum_weighted + DYsum_weighted + TTGsJetssum_weighted + WGsJetssum_weighted + WWsum_weighted
    nevents_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH', 'unweighted']
    nevents_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg', 'unweighted']
    nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID'] == 'DiPhoton', 'unweighted']
    nevents_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet', 'unweighted']
    nevents_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD', 'unweighted']
    nevents_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY', 'unweighted']
    nevents_for_TTGsJets = traindataset.loc[traindataset['process_ID'] == 'TTGsJets', 'unweighted']
    nevents_for_WGsJets = traindataset.loc[traindataset['process_ID'] == 'WGsJets', 'unweighted']
    nevents_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW', 'unweighted']
    HHsum_unweighted = sum(nevents_for_HH)
    Hggsum_unweighted = sum(nevents_for_Hgg)
    DiPhotonsum_unweighted = sum(nevents_for_DiPhoton)
    GJetsum_unweighted = sum(nevents_for_GJet)
    QCDsum_unweighted = sum(nevents_for_QCD)
    DYsum_unweighted = sum(nevents_for_DY)
    TTGsJetssum_unweighted = sum(nevents_for_TTGsJets)
    WGsJetssum_unweighted = sum(nevents_for_WGsJets)
    WWsum_unweighted = sum(nevents_for_WW)
    bckgsum_unweighted = (Hggsum_unweighted + DiPhotonsum_unweighted + GJetsum_unweighted + QCDsum_unweighted
                          + DYsum_unweighted + TTGsJetssum_unweighted + WGsJetssum_unweighted + WWsum_unweighted)
    # bckgsum_unweighted = DiPhotonsum_unweighted + GJetsum_unweighted + QCDsum_unweighted + DYsum_unweighted + TTGsJetssum_unweighted + WGsJetssum_unweighted + WWsum_unweighted
    HHsum_weighted = 2 * HHsum_weighted
    HHsum_unweighted = 2 * HHsum_unweighted

    if weights == 'BalanceYields':
        print('HHsum_weighted= ', HHsum_weighted)
        print('Hggsum_weighted= ', Hggsum_weighted)
        print('DiPhotonsum_weighted= ', DiPhotonsum_weighted)
        print('GJetsum_weighted= ', GJetsum_weighted)
        print('QCDsum_weighted= ', QCDsum_weighted)
        print('DYsum_weighted= ', DYsum_weighted)
        print('TTGsJetssum_weighted= ', TTGsJetssum_weighted)
        print('WGsJetssum_weighted= ', WGsJetssum_weighted)
        print('WWsum_weighted= ', WWsum_weighted)
        print('bckgsum_weighted= ', bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'HH', ['classweight']] = HHsum_unweighted / HHsum_weighted
        traindataset.loc[traindataset['process_ID'] == 'Hgg', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'GJet', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'QCD', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DY', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WW', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)

    if weights == 'BalanceNonWeighted':
        print('HHsum_unweighted= ', HHsum_unweighted)
        print('Hggsum_unweighted= ', Hggsum_unweighted)
        print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted)
        print('GJetsum_unweighted= ', GJetsum_unweighted)
        print('QCDsum_unweighted= ', QCDsum_unweighted)
        print('DYsum_unweighted= ', DYsum_unweighted)
        print('TTGsJetssum_unweighted= ', TTGsJetssum_unweighted)
        print('WGsJetssum_unweighted= ', WGsJetssum_unweighted)
        print('WWsum_unweighted= ', WWsum_unweighted)
        print('bckgsum_unweighted= ', bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'HH', ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID'] == 'Hgg', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'GJet', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'QCD', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DY', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'WGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'WW', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)

    # Remove column headers that aren't input variables
    training_columns = column_headers[:-6]
    print('<train-DNN> Training features: ', training_columns)
    column_order_txt = '%s/column_order.txt' % (output_directory)
    column_order_file = open(column_order_txt, "wb")
    for tc_i in training_columns:
        line = tc_i + "\n"
        pickle.dump(str(line), column_order_file)
    num_variables = len(training_columns)

    # Extract training and testing data
    X_train = traindataset[training_columns].values
    X_test = valdataset[training_columns].values
    # Extract labels data
    Y_train = traindataset['target'].values
    Y_test = valdataset['target'].values

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]

    # Event weights if wanted
    train_weights = traindataset['weight'].values
    test_weights = valdataset['weight'].values

    # Weights applied during training.
    if weights == 'BalanceYields':
        trainingweights = traindataset.loc[:, 'classweight'] * traindataset.loc[:, 'weight']
    if weights == 'BalanceNonWeighted':
        trainingweights = traindataset.loc[:, 'classweight']
    trainingweights = np.array(trainingweights)

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name + '.png')
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name + '.pdf')

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)

    if do_model_fit == 1:
        print('<train-BinaryDNN> Training new model . . . . ')
        histories = []
        labels = []
        if hyp_param_scan == 1:
            print('Begin at local time: ', time.localtime())
            hyp_param_scan_name = 'hyp_param_scan_results.txt'
            hyp_param_scan_results = open(hyp_param_scan_name, 'a')
            time_str = str(time.localtime()) + '\n'
            hyp_param_scan_results.write(time_str)
            hyp_param_scan_results.write(weights)
            learn_rates = [0.00001, 0.0001]
            epochs = [150, 200]
            batch_size = [400, 500]
            param_grid = dict(learn_rate=learn_rates, epochs=epochs, batch_size=batch_size)
            model = KerasClassifier(build_fn=gscv_model, verbose=0)
            grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
            grid_result = grid.fit(X_train, Y_train, shuffle=True, sample_weight=trainingweights)
            print("Best score: %f , best params: %s" % (grid_result.best_score_, grid_result.best_params_))
            hyp_param_scan_results.write("Best score: %f , best params: %s\n"
                                         % (grid_result.best_score_, grid_result.best_params_))
            means = grid_result.cv_results_['mean_test_score']
            stds = grid_result.cv_results_['std_test_score']
            params = grid_result.cv_results_['params']
            for mean, stdev, param in zip(means, stds, params):
                print("Mean (stdev) test score: %f (%f) with parameters: %r" % (mean, stdev, param))
                hyp_param_scan_results.write("Mean (stdev) test score: %f (%f) with parameters: %r\n"
                                             % (mean, stdev, param))
            exit()
        else:
            # Define model for analysis
            early_stopping_monitor = EarlyStopping(patience=100, monitor='val_loss',
                                                   min_delta=0.01, verbose=1)
            # model = baseline_model(num_variables, learn_rate=learn_rate)
            model = new_model(num_variables, learn_rate=learn_rate)
            # Fit the model
            # Batch size = examples before updating weights (larger = faster training)
            # Epoch = One pass over data (useful for periodic logging and evaluation)
            # class_weights = np.array(class_weight.compute_class_weight('balanced', np.unique(Y_train), Y_train))
            history = model.fit(X_train, Y_train,
                                validation_split=validation_split,
                                epochs=epochs, batch_size=batch_size,
                                verbose=1, shuffle=True,
                                sample_weight=trainingweights,
                                callbacks=[early_stopping_monitor])
            histories.append(history)
            labels.append(optimizer)
            # Make plot of loss function evolution
            Plotter.plot_training_progress_acc(histories, labels)
            acc_progress_filename = 'DNN_acc_wrt_epoch'
            Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename + '.png')
            Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename + '.pdf')
            Plotter.history_plot(history, label='loss')
            Plotter.save_plots(dir=plots_dir, filename='history_loss.png')
            Plotter.save_plots(dir=plots_dir, filename='history_loss.pdf')
    else:
        model_name = os.path.join(output_directory, 'model.h5')
        model = load_trained_model(model_name)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))
    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory, 'model_schematic.png')
    # plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)
    print('================')
    print('Training event labels: ', len(Y_train))
    print('Training event probs', len(result_probs))
    print('Training event weights: ', len(train_weights))
    print('Testing events: ', len(Y_test))
    print('Testing event probs', len(result_probs_test))
    print('Testing event weights: ', len(test_weights))
    print('================')

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    Plotter.ROC(model, X_test, Y_test, X_train, Y_train)
    Plotter.save_plots(dir=plots_dir, filename='ROC.png')
    Plotter.save_plots(dir=plots_dir, filename='ROC.pdf')
def main():
    ########################
    ### Parse Input Args ###
    ########################
    parser = argparse.ArgumentParser(
        description='CNN classification code implemented using TensorFlow v2.0',
        epilog='https://github.com/azodichr')
    parser.add_argument('-x', help='Feature numpy dataset', required=True)
    parser.add_argument('-y', help='Class/Y numpy dataset', required=True)
    parser.add_argument('-run', help='T/F to run final models', default='t')
    parser.add_argument('-splits', help='Values for train/val/test', default='70,10,20')
    parser.add_argument('-y_name', help='Phenotype Trait')
    parser.add_argument('-f', help='Function: gs, run, full', default='full')
    parser.add_argument('-save', help='Name for Output File', default='test')
    parser.add_argument('-balance', help='t/f to downsample so balance classes', default='t')
    parser.add_argument('-n_channels', help='Number of channels', default=1, type=int)
    parser.add_argument('-cv', help='Number of cross validation folds', type=int, default=5)
    parser.add_argument('-n_jobs', '-p', help='Number of processors for '
                        'parallel computing (max for HPCC = 14)', type=int, default=1)
    parser.add_argument('-save_model', help='T/F if want to save final models',
                        type=str, default='f')
    parser.add_argument('-tag', help='Identifier String to add to RESULTS',
                        type=str, default='cnn')
    parser.add_argument('-save_detailed', help='T/F Save detailed model performance',
                        type=str, default='f')
    parser.add_argument('-original_df', help='DF fed into input_converter.py',
                        type=str, default='')
    parser.add_argument('-imp_m', help='T/F to calculate importance of each motif',
                        type=str, default='f')
    parser.add_argument('-imp_k', help='T/F to calculate importance of each kernel',
                        type=str, default='f')

    # Default Hyperparameters
    parser.add_argument('-params', help='Output from -f gs (i.e. '
                        'SAVE_GridSearch.txt)', default='default')
    parser.add_argument('-actfun', help='Activation function (relu, sigmoid)',
                        default='relu')
    parser.add_argument('-learn_rate', help='Learning Rate', default=0.01, type=float)
    parser.add_argument('-dropout', help='Dropout rate', default=0.25, type=float)
    parser.add_argument('-l2', help='Shrinkage parameter for L2 regularization',
                        default=0.25, type=float)
    parser.add_argument('-filters', help='Number of Kernels/filters', default=8, type=int)
    parser.add_argument('-optimizer', help='Optimization function to use',
                        type=str, default='Adam')
    parser.add_argument('-dense', help='Number of nodes in dense layer',
                        type=int, default=16)
    parser.add_argument('-activation', help='Activation function in all but '
                        'last dense layer, which is set to linear',
                        type=str, default='relu')
    parser.add_argument('-n_reps', '-n', help='Number of replicates (unique '
                        'validation set/starting weights for each)',
                        default=100, type=int)
    parser.add_argument('-clip_value', help='Clip Value', type=float, default=0.5)
    parser.add_argument('-patience', help='Patience for Early Stopping',
                        type=int, default=5)
    parser.add_argument('-min_delta', help='Minimum Delta Value for Early '
                        'Stopping', type=float, default=0)

    # Grid Search reps/space
    parser.add_argument('-gs_reps', '-gs_n', help='Number of Grid Search Reps '
                        '(will append results if SAVE_GridSearch.csv exists)',
                        type=int, default=10)
    parser.add_argument('-actfun_gs', help='Activation functions for Grid '
                        'Search', nargs='*', default=['relu', 'selu', 'elu'])
    parser.add_argument('-dropout_gs', help='Dropout rates for Grid Search',
                        nargs='*', type=float, default=[0.0, 0.1, 0.25])
    parser.add_argument('-l2_gs', help='Shrinkage parameters for L2 for Grid '
                        'Search', nargs='*', type=float, default=[0.01, 0.1, 0.25])
    parser.add_argument('-lrate_gs', help='Learning Rate', nargs='*',
                        type=float, default=[0.1, 0.01, 0.001, 0.0001])
    parser.add_argument('-kernels_gs', help='Number of Kernels for Grid Search',
                        default=[4, 8, 16, 24], type=int)
    args = parser.parse_args()

    k_height = 'tmp'
    args.k_len = 'tmp'

    def downsample(x, y):
        # Down-sample the majority class so both classes have equal counts.
        unique, counts = np.unique(y_all, return_counts=True)
        smaller_index = list(counts).index(min(counts))
        bigger_index = list(counts).index(max(counts))
        i_smaller = np.where(y_all == unique[smaller_index])[0]
        i_bigger = np.where(y_all == unique[bigger_index])[0]
        downsample_n = len(i_smaller)
        i_bigger_downsampled = np.random.choice(i_bigger, size=downsample_n, replace=False)
        i_keep = list(i_smaller) + list(i_bigger_downsampled)
        y = y_all[i_keep]
        x = x_all[i_keep]
        return x, y

    def make_cnn_model(learn_rate=args.learn_rate, filters=args.filters,
                       dropout=args.dropout, dense=args.dense, l2=args.l2,
                       activation=args.activation, optimizer=args.optimizer,
                       units=1):
        if optimizer.lower() == 'adam':
            opt = tf.keras.optimizers.Adam(lr=learn_rate, clipvalue=args.clip_value)
        elif optimizer.lower() == 'nadam':
            opt = tf.keras.optimizers.Nadam(lr=learn_rate, clipvalue=args.clip_value)
        elif optimizer.lower() == 'rmsprop':
            opt = tf.keras.optimizers.RMSprop(lr=learn_rate, clipvalue=args.clip_value)
        elif optimizer.lower() == 'sgdm':
            opt = tf.keras.optimizers.SGD(lr=learn_rate, decay=1e-6,
                                          clipvalue=args.clip_value,
                                          momentum=0.9, nesterov=True)
        conv2d_layer = layers.Conv2D(
            filters=filters,
            kernel_size=tuple([k_height, 1]),
            kernel_regularizer=tf.keras.regularizers.l2(l2),
            activation=activation,
            kernel_initializer='glorot_normal',
            input_shape=(n_rows, n_columns, args.n_channels),
            name='conv2d_layer')
        K.clear_session()
        model = models.Sequential()
        model.add(conv2d_layer)
        model.add(layers.Flatten())
        model.add(layers.Dense(dense, activation=activation))
    ##########################
    ### Data preprocessing ###
    ##########################
    x_all = np.load(args.x)
    y_all = np.load(args.y)
    x_all = x_all.reshape(x_all.shape + (args.n_channels, ))

    if args.balance.lower() in ['t', 'true']:
        x, y = downsample(x_all, y_all)
        print('Y shape (down-sampled): %s' % str(y.shape))
        print('X shape (down-sampled): %s' % str(x.shape))
    else:
        y = y_all
        x = x_all

    print("\nSnapshot of feature data for first instance in data set:")
    print(x[0, :, 0:5, 0])

    n = y.shape[0]
    n_rows = x.shape[1]
    n_columns = x.shape[2]
    k_height = x.shape[1]  # Each kernel spans every row of the input
    args.k_len = 1
    print('Kernel dimensions: ', k_height, args.k_len)

    ###################
    ### Grid Search ###
    ###################
    if args.params.lower() == 'gs':
        print('\n***** Starting Random Search with %i reps using %i '
              'instances and %i-fold cross-validation *****\n'
              % (args.gs_reps, x.shape[0], args.cv))
        scoring = {'acc': 'accuracy', 'f1': 'f1'}
        param_grid = dict(learn_rate=[0.1, 0.01, 0.001],
                          filters=[8, 16],
                          dense=[8, 16, 32],
                          l2=[0.1, 0.25],  # , 0.5],
                          dropout=[0.1, 0.25],  # , 0.5],
                          activation=["relu"],  # , 'selu', 'elu'],
                          optimizer=['RMSprop', 'Adam', 'nadam'])

        def make_cnn_model_sk(learn_rate=args.learn_rate,
                              filters=args.filters, dropout=args.dropout,
                              dense=args.dense, l2=args.l2,
                              activation=args.activation,
                              optimizer=args.optimizer):
            # KerasClassifier expects build_fn to return only the compiled
            # model; make_cnn_model also returns the conv layer handle.
            model, _ = make_cnn_model(learn_rate=learn_rate, filters=filters,
                                      dropout=dropout, dense=dense, l2=l2,
                                      activation=activation,
                                      optimizer=optimizer)
            return model

        model = KerasClassifier(build_fn=make_cnn_model_sk, batch_size=100,
                                epochs=30, verbose=0)
        rand_search = RandomizedSearchCV(estimator=model,
                                         param_distributions=param_grid,
                                         cv=args.cv, n_iter=args.gs_reps,
                                         n_jobs=args.n_jobs, scoring=scoring,
                                         refit='acc', verbose=0)
        gs_result = rand_search.fit(x, y)
        gs_result_df = pd.DataFrame.from_dict(gs_result.cv_results_)

        print("Saving Grid Search Results....")
        print(gs_result_df.head())
        with open(args.save + "_GridSearch.txt", 'a') as out_gs:
            gs_result_df.to_csv(out_gs, header=out_gs.tell() == 0, sep='\t')
        print('\n\n Grid Search results saved to: %s_GridSearch.txt\n'
              % args.save)
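    # Illustrative usage (the script name and file names are hypothetical):
    # a grid-search pass and a final run can be chained from the shell, e.g.:
    #     python cnn_tf2.py -x x.npy -y y.npy -save test -params gs
    #     python cnn_tf2.py -x x.npy -y y.npy -save test -params test_GridSearch.txt
    # The second call aggregates test_GridSearch.txt (see below) and runs
    # the final models with the best-scoring parameter combination.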
    #######################
    ### Run final model ###
    #######################
    if args.run.lower() in ['t', 'true']:
        print('####### Running Final Model(s) ###########')

        # Step 1: Define the parameters from the Grid Search or use defaults
        if args.params.lower() != 'default':
            if args.params.lower() != 'gs':
                gs_result_df = pd.read_csv(args.params, sep='\t')
            gs_result_df.fillna(0, inplace=True)
            gs_mean = gs_result_df.groupby([
                'param_filters', 'param_optimizer', 'param_learn_rate',
                'param_dropout', 'param_l2', 'param_dense',
                'param_activation']).agg({
                    'mean_test_acc': 'mean',
                    'std_test_acc': 'mean',
                    'mean_fit_time': 'count'}).reset_index()
            print('Parameter Search Coverage: \nMin: %i\nMean: %.3f\nMax: %i'
                  % (gs_mean['mean_fit_time'].min(),
                     gs_mean['mean_fit_time'].mean(),
                     gs_mean['mean_fit_time'].max()))
            if gs_mean['mean_fit_time'].min() == 1:
                print('Dropping parameter combinations with < 2 replicates...')
                gs_mean = gs_mean[gs_mean['mean_fit_time'] >= 2]
            gs_mean = gs_mean.sort_values(by='mean_test_acc', ascending=False)
            print('\nSnapshot of grid search results:')
            print(gs_mean.head())
            args.learn_rate = float(gs_mean['param_learn_rate'].iloc[0])
            args.l2 = float(gs_mean['param_l2'].iloc[0])
            args.dropout = float(gs_mean['param_dropout'].iloc[0])
            args.filters = int(gs_mean['param_filters'].iloc[0])
            args.dense = int(gs_mean['param_dense'].iloc[0])
            args.activation = gs_mean['param_activation'].iloc[0]
            args.optimizer = gs_mean['param_optimizer'].iloc[0]

        print('\n***** Running CNN models ******')
        print('Optimizer: %s\nActivation function: %s\nLearning Rate: %.4f\n'
              'Number of kernels: %i\nL2: %.4f\nDropout: %.4f\n'
              'Dense nodes: %s\n'
              % (args.optimizer, args.activation, args.learn_rate,
                 args.filters, args.l2, args.dropout, args.dense))

        final_results = pd.DataFrame()
        motif_imps = pd.DataFrame()
        kern_imp = []

        for n in range(args.n_reps):
            print("\nReplicate %i/%i" % (n + 1, args.n_reps))
            x, y = downsample(x_all, y_all)
            print(x.shape)

            # Step 2: Build the model with the selected parameters (the
            # original hard-coded 'sgdm' here, silently ignoring -optimizer).
            model, conv2d_layer = make_cnn_model(learn_rate=args.learn_rate,
                                                 optimizer=args.optimizer,
                                                 filters=args.filters,
                                                 dense=args.dense, l2=args.l2,
                                                 dropout=args.dropout,
                                                 activation=args.activation)
            # print(model.summary())

            # Step 3: Split into training, validation, and test sets
            # (roughly 80/10/10 after the two splits below)
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, stratify=y, test_size=0.1)
            x_train, x_val, y_train, y_val = train_test_split(
                x_train, y_train, stratify=y_train, test_size=0.111)
            print('Train on %i, validate on %i, test on %i'
                  % (x_train.shape[0], x_val.shape[0], x_test.shape[0]))

            # Step 4: Define early stopping criteria & train. The model is
            # already compiled inside make_cnn_model with the configured
            # optimizer, learning rate, and clip value, so it is not
            # re-compiled here (re-compiling with the optimizer *name*
            # would reset the learning rate to its default).
            earlystop_callback = EarlyStopping(monitor='val_loss', mode='min',
                                               min_delta=args.min_delta,
                                               patience=args.patience,
                                               restore_best_weights=True,
                                               verbose=0)
            model.fit(x_train, y_train, batch_size=50, epochs=1000, verbose=0,
                      callbacks=[earlystop_callback],
                      validation_data=(x_val, y_val))

            train_loss, train_acc = model.evaluate(x_train, y_train)
            val_loss, val_acc = model.evaluate(x_val, y_val)
            test_loss, test_acc = model.evaluate(x_test, y_test)

            # Pick the decision threshold that maximizes F1 on the
            # validation set.
            val_yhat = model.predict(x_val)
            max_f1 = 0
            best_thresh = 0
            for thr in np.arange(0.01, 1, 0.01):
                thr_pred = val_yhat.copy()
                thr_pred[thr_pred >= thr] = 1
                thr_pred[thr_pred < thr] = 0
                # Skip thresholds where (nearly) all predictions are
                # negative, since F1 and auROC are then undefined.
                if sum(thr_pred) > 1:
                    f1 = f1_score(y_val, thr_pred, pos_label=1)
                    if f1 >= max_f1:
                        max_f1 = f1
                        best_thresh = thr
            print('Threshold for F1 measure: %.3f' % best_thresh)
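            # Worked example (illustrative numbers): for validation
            # probabilities [0.20, 0.70, 0.90] with labels [0, 1, 1], every
            # threshold in (0.20, 0.70] gives F1 = 1.0, so the sweep settles
            # on ~0.70, the largest grid point in that range (ties update
            # best_thresh because the comparison is >=).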
            # Calculate auROC and F1 on train, val, and test sets using the
            # tuned threshold.
            yhat_train = model.predict(x_train)
            train_auroc = roc_auc_score(y_train, yhat_train)
            yhat_train[yhat_train >= best_thresh] = 1
            yhat_train[yhat_train < best_thresh] = 0
            train_f1 = f1_score(y_train, yhat_train, pos_label=1)

            yhat_val = model.predict(x_val)
            val_auroc = roc_auc_score(y_val, yhat_val)
            yhat_val[yhat_val >= best_thresh] = 1
            yhat_val[yhat_val < best_thresh] = 0
            val_f1 = f1_score(y_val, yhat_val, pos_label=1)

            yhat_test = model.predict(x_test)
            test_auroc = roc_auc_score(y_test, yhat_test)
            yhat_test[yhat_test >= best_thresh] = 1
            yhat_test[yhat_test < best_thresh] = 0
            test_f1 = f1_score(y_test, yhat_test, pos_label=1)

            if args.save_model.lower() in ['t', 'true']:
                model.save(args.save + '_model_' + str(n) + '.h5')

            final_results = final_results.append(
                {'ID': args.save, 'Tag': args.tag, 'Rep': n,
                 'X_file': args.x, 'Y_file': args.y,
                 'ActFun': args.activation, 'dropout': args.dropout,
                 'L2': args.l2, 'LearnRate': args.learn_rate,
                 'Optimizer': args.optimizer, 'n_Kernels': args.filters,
                 'F1_threshold': best_thresh, 'n_Dense': args.dense,
                 'Acc_train': train_acc, 'Loss_train': train_loss,
                 'auROC_train': train_auroc, 'F1_train': train_f1,
                 'Acc_val': val_acc, 'Loss_val': val_loss,
                 'auROC_val': val_auroc, 'F1_val': val_f1,
                 'Acc_test': test_acc, 'Loss_test': test_loss,
                 'auROC_test': test_auroc, 'F1_test': test_f1},
                ignore_index=True)

            ##########################
            ## Model Interpretation ##
            ##########################
            if (args.imp_m.lower() in ['t', 'true']
                    or args.imp_k.lower() in ['t', 'true']):
                # Step 1: Read in the metadata for the x data
                key = pd.read_csv(args.original_df, sep='\t', index_col=0)
                key_index_list = key.columns.str.split('_',
                                                       expand=True).values
                key.columns = pd.MultiIndex.from_tuples(
                    [(x[1], x[0]) for x in key_index_list])
                key = key.sort_index(axis=1)
                motifs = key.columns.levels[0].values
                omic_stack = list(key[list(key.columns.levels[0])[0]])
                omic_stack.append('PA')

            # Calculate motif importance (zero out each feature column and
            # measure the drop in test auROC)
            if args.imp_m.lower() in ['t', 'true']:
                motif_imp = np.empty((0, 2))
                model_mot_imp = model
                for mx in range(0, x_test.shape[2] - 1):
                    x_test_tmp = np.copy(x_test)
                    x_test_tmp[:, :, mx, :] = 0
                    yhat_m_imp = model_mot_imp.predict(x_test_tmp)
                    auroc_m_imp = roc_auc_score(y_test, yhat_m_imp)
                    imp_m_auc = test_auroc - auroc_m_imp
                    motif_imp = np.vstack(
                        (motif_imp, np.array([motifs[mx], imp_m_auc])))
                motif_imp = pd.DataFrame(
                    motif_imp, columns=['motif', 'auROC_test_decrease'])
                if n == 0:
                    motif_imps = motif_imp
                else:
                    motif_imps = pd.merge(motif_imps, motif_imp, on='motif')
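            # Reading the occlusion scores: auROC_test_decrease > 0 means
            # masking that motif hurt the model, ~0 means the motif was
            # largely ignored, and < 0 means the model scored slightly
            # better without it (usually noise at this sample size).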
            # Calculate kernel importance (zero out one kernel's weights:
            # leave-one-kernel-out, LOKO)
            if args.imp_k.lower() in ['t', 'true']:
                all_weights = model.get_weights()
                print('Performing Leave-One-Kernel-Out importance '
                      'analysis...')
                for kx in range(0, args.filters):
                    orig_weights = all_weights[0][:, :, 0, kx].copy()
                    orig_weights = orig_weights.tolist()
                    orig_weights = [i for l in orig_weights for i in l]
                    conv2d_drop = copy.deepcopy(all_weights)
                    conv2d_drop[0][:, :, 0, kx] = 0.0
                    print(conv2d_drop[0][1, :, 0, 0:10])
                    model_LOKO = tf.keras.models.clone_model(model)
                    model_LOKO.set_weights(weights=conv2d_drop)
                    yhat_k_imp = model_LOKO.predict(x_test)
                    auroc_k_imp = roc_auc_score(y_test, yhat_k_imp)
                    imp_k_auc = test_auroc - auroc_k_imp
                    old = roc_auc_score(y_test, model.predict(x_test))
                    print(old, imp_k_auc)
                    kern_imp.append([n, imp_k_auc, orig_weights])

        if args.imp_m.lower() in ['t', 'true']:
            print('Snapshot of motif importance scores...')
            motif_imps = motif_imps.set_index('motif')
            motif_imps = motif_imps.apply(pd.to_numeric, errors='coerce')
            motif_imps['mean_imp'] = motif_imps.mean(axis=1)
            motif_imps = motif_imps.sort_values('mean_imp', axis=0,
                                                ascending=False)
            print(motif_imps['mean_imp'].head())
            motif_imps['mean_imp'].to_csv(args.save + "_Motif_imp",
                                          sep="\t", index=True)

        if args.imp_k.lower() in ['t', 'true']:
            print('\nSnapshot of kernel importance scores:')
            kern_imp = pd.DataFrame(
                kern_imp, columns=['rep', 'auROC_test_decrease', 'kernel'])
            print(kern_imp.head())
            kern_imp.to_csv(args.save + "_Kernel_imp", sep="\t", index=True)

        final_results.to_csv(args.save + "_results.txt", header=True,
                             sep='\t')

        # Save summary of results to RESULTS.txt
        calc_cols = ['F1_threshold', 'Acc_train', 'Acc_val', 'Acc_test',
                     'Loss_train', 'Loss_val', 'Loss_test', 'auROC_train',
                     'auROC_val', 'auROC_test', 'F1_train', 'F1_val',
                     'F1_test']
        final_results = final_results.drop(['Rep'], axis=1)
        std = final_results[calc_cols].std(axis=0, skipna=True)
        std = std.add_suffix('_std')
        mean = final_results[calc_cols].mean(axis=0, skipna=True)
        mean = mean.add_suffix('_mean')
        str_cols = final_results.drop(calc_cols, axis=1).iloc[0]
        str_cols = str_cols.append(pd.Series([args.n_reps], index=['Reps']))
        summary = pd.concat([str_cols, mean, std])
        # summary.set_index('index', inplace=True)

        print('\n### Summary of results on test set ###')
        print(summary.filter(like='test_mean', axis=0))
        with open("RESULTS.txt", 'a') as f:
            summary.to_frame().transpose().to_csv(f, header=f.tell() == 0,
                                                  sep='\t')

    print('Done!')


if __name__ == '__main__':
    main()
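# ---------------------------------------------------------------------------
# Minimal, self-contained sketch (not from the original script) of the
# leave-one-kernel-out (LOKO) idea used above: clone a trained model, zero
# out one convolutional kernel's weights, and compare test scores. All data,
# names, and shapes here are synthetic and purely illustrative.
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
x_demo = rng.normal(size=(200, 10, 4, 1)).astype('float32')
y_demo = rng.integers(0, 2, size=200)

# A toy analogue of the CNN built above: one conv layer spanning all rows.
demo = tf.keras.Sequential([
    tf.keras.layers.Conv2D(8, (10, 1), activation='relu',
                           input_shape=(10, 4, 1), name='conv2d_layer'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')])
demo.compile(optimizer='adam', loss='binary_crossentropy')
demo.fit(x_demo, y_demo, epochs=2, verbose=0)

base_auroc = roc_auc_score(y_demo, demo.predict(x_demo).ravel())
for kx in range(8):
    weights = [w.copy() for w in demo.get_weights()]
    weights[0][:, :, 0, kx] = 0.0  # silence kernel kx
    loko = tf.keras.models.clone_model(demo)
    loko.set_weights(weights)
    drop = base_auroc - roc_auc_score(y_demo, loko.predict(x_demo).ravel())
    print('kernel %i: auROC drop %.4f' % (kx, drop))
# ---------------------------------------------------------------------------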
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold, cross_val_score

# X, dummy_y (one-hot labels), and seed are assumed to be defined earlier in
# this fragment's original context.
print(dummy_y)
print(X.shape)
print(X)


# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(4, input_dim=4, kernel_initializer='normal',
                    activation='relu'))
    model.add(Dense(3, kernel_initializer='normal', activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model


estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5,
                            verbose=0)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100,
                                     results.std() * 100))
# KerasClassifier itself has no save(); fit it once on all the data and save
# the underlying Keras model instead.
estimator.fit(X, dummy_y)
estimator.model.save("estimator.h5")
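# Hedged usage sketch (not in the original snippet): reload the model saved
# above for inference. Assumes X is still in scope.
from keras.models import load_model

reloaded = load_model("estimator.h5")
print(reloaded.predict(X[:5]))  # class probabilities for the first 5 rows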