model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_elo, Y_train_elo, epochs=10, batch_size=50, validation_split=0.2, verbose=1)
model.test_on_batch(X_test_elo, Y_test_elo, sample_weight=None)
model.evaluate(X_test_elo, Y_test_elo, verbose=1)
pred = model.predict_classes(X_test_elo, verbose=1)
plot_model(model, to_file='model.png', show_shapes=True)
SVG(model_to_dot(model).create(prog='dot', format='svg'))
print(confusion_matrix(Y_test_elo, pred))
print(classification_report(Y_test_elo, pred))
print(accuracy_score(Y_test_elo, pred))
fpr_elo, tpr_elo, thresholds_elo = roc_curve(Y_test_elo, pred)
# Bind to a new name so sklearn's auc() function is not shadowed.
auc_elo = auc(fpr_elo, tpr_elo)
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_elo, tpr_elo, label='ROC (area = {:.3f})'.format(auc_elo))
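# Note: Sequential.predict_classes() was deprecated and removed in
# TensorFlow 2.6. A minimal sketch of the equivalent call for this binary
# sigmoid model, assuming the usual 0.5 decision threshold:
pred = (model.predict(X_test_elo) > 0.5).astype("int32").ravel()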
# In[ ]:

plot_loss(model_output.history, COURSE_LIST[course_idx])

# In[ ]:

plot_accuracy(model_output.history, COURSE_LIST[course_idx])

# In[ ]:

course_metrics['course_name'].append(COURSE_LIST[course_idx])
course_metrics['val_binary_accuracy'].append(
    model_output.history['val_binary_accuracy'][-1])
# sklearn metrics expect (y_true, y_pred), so the labels are passed first.
course_metrics['test_accuracy'].append(
    accuracy_score(labels_test, model.predict_classes(features_test)))
course_metrics['test_f1_score'].append(
    f1_score(labels_test, model.predict_classes(features_test)))

# ### 2. CS50x - Introduction to Computer Science I

# In[ ]:

course_idx = 1
print(COURSE_LIST[course_idx])

# In[ ]:

course_loc = DATA_DIR + COURSE_LIST[course_idx]
print(course_loc)
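# A minimal sketch of what a plot_loss helper like the one used above might
# do; the project's actual implementation is not shown in this section:
import matplotlib.pyplot as plt

def plot_loss(history, course_name):
    # history is the Keras History.history dict for one course's model.
    plt.plot(history['loss'], label='train')
    plt.plot(history['val_loss'], label='validation')
    plt.title('Loss: %s' % course_name)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
    plt.show()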
# Imports gathered at the top of the script (load_model was missing from
# the original import list even though it is called below).
import re
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

model = load_model('saved_model.h5')
test_data = [
    "A lot of good things are happening. We are respected again throughout the world, and that's a great thing"
]
# Caveat: fitting a fresh Tokenizer on the test sentence alone does not
# reproduce the word indices the model was trained with; the tokenizer
# fitted on the training corpus should be reused instead (see the sketch
# after this snippet).
max_features = 200
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(test_data)
X = tokenizer.texts_to_sequences(test_data)
max_len = 28
X = pad_sequences(X, maxlen=max_len)
class_names = ['positive', 'negative']
preds = model.predict(X)
print(preds)
classes = model.predict_classes(X)
print(classes)
print(class_names[classes[0]])
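# A minimal sketch of reusing the training-time tokenizer, assuming it was
# pickled at training time (the filename 'tokenizer.pickle' is hypothetical):
import pickle

with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
X = pad_sequences(tokenizer.texts_to_sequences(test_data), maxlen=max_len)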
# Summarize history for loss.
from matplotlib import pyplot as plt
plt.plot(model_history.history['loss'])
plt.plot(model_history.history['val_loss'])  # only available if fit used validation
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

"""We will check the accuracy on the test dataset."""

model.evaluate(test_images, to_categorical(test_labels))

"""### Prediction"""

import numpy as np

ans = model.predict(test_kaggle)
ans = np.argmax(ans, axis=1)
ans[:5]

predicted_classes = model.predict_classes(test_kaggle)
submissions = pd.DataFrame({"ImageId": list(range(1, len(predicted_classes) + 1)),
                            "Label": predicted_classes})
submissions.to_csv("submission2.csv", index=False, header=True)
print("class weights = ", class_weights)

# Create the classifier object using the builder function defined above.
classifier = KerasClassifier(build_fn=classifier_builder, batch_size=16, epochs=1)
if os.access("lstm_model.h5", os.F_OK):
    classifier = load_model('lstm_model.h5')
for i in range(0, runLoop):
    hist = classifier.fit(X_train, y_train,
                          batch_size=256,
                          epochs=runEpoch,
                          class_weight=class_weights,
                          validation_data=(X_test, y_test))
    print("loop i=", i, "hist:", hist.history)
    y_predict = classifier.predict_classes(X_test, batch_size=256)
    y_predict = [j[0] for j in y_predict]
    precision = precision_score(y_test, y_predict, average='macro')
    recall = recall_score(y_test, y_predict, average='macro')
    print("Precision:", precision)
    print("Recall:", recall)

if os.access("lstm_model.h5", os.F_OK):
    print(classifier.summary())
    classifier.save('lstm_model.h5')
else:
    print(classifier.model.summary())
    classifier.model.save('lstm_model.h5')
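# A minimal sketch of how the class_weights dict used above could be built
# with scikit-learn (an assumption; the original construction is not shown):
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

balanced = compute_class_weight(class_weight='balanced',
                                classes=np.unique(y_train),
                                y=y_train)
class_weights = dict(enumerate(balanced))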
def main():
    print('Using Keras version: ', keras.__version__)
    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument('-t', '--train_model', dest='train_model',
                        help='Option to train model or simply make diagnostic plots (0=False, 1=True)',
                        default=1, type=int)
    parser.add_argument('-s', '--suff', dest='suffix',
                        help='Option to choose suffix for training', default='', type=str)
    parser.add_argument('-p', '--para', dest='hyp_param_scan',
                        help='Option to run hyper-parameter scan', default=0, type=int)
    parser.add_argument('-i', '--inputs_file_path', dest='inputs_file_path',
                        help='Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.',
                        default='', type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    suffix = args.suffix

    # Create instance of the input files directory
    #inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/'
    inputs_file_path = '/eos/user/b/bmarzocc/HHWWgg/January_2021_Production/2017/'
    hyp_param_scan = args.hyp_param_scan

    # Set model hyper-parameters
    weights = 'BalanceYields'  # 'BalanceYields' or 'BalanceNonWeighted'
    optimizer = 'Nadam'
    validation_split = 0.1
    # hyper-parameter scan results
    if weights == 'BalanceNonWeighted':
        learn_rate = 0.0005
        epochs = 200
        batch_size = 200
    if weights == 'BalanceYields':
        learn_rate = 0.0001
        epochs = 200
        batch_size = 32
        #epochs = 10
        #batch_size = 200

    # Create instance of output directory where all results are saved.
    output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix, weights)
    check_dir(output_directory)
    hyperparam_file = os.path.join(output_directory, 'additional_model_hyper_params.txt')
    additional_hyperparams = open(hyperparam_file, 'w')
    additional_hyperparams.write("optimizer: " + optimizer + "\n")
    additional_hyperparams.write("learn_rate: " + str(learn_rate) + "\n")
    additional_hyperparams.write("epochs: " + str(epochs) + "\n")
    additional_hyperparams.write("validation_split: " + str(validation_split) + "\n")
    additional_hyperparams.write("weights: " + weights + "\n")
    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')
    input_var_jsonFile = open('input_variables.json', 'r')
    selection_criteria = '( (Leading_Photon_pt/CMS_hgg_mass) > 1/3 && (Subleading_Photon_pt/CMS_hgg_mass) > 1/4 )'

    # Load Variables from .json
    variable_list = json.load(input_var_jsonFile, encoding="utf-8").items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('weight')
    column_headers.append('unweighted')
    column_headers.append('target')
    column_headers.append('key')
    column_headers.append('classweight')
    column_headers.append('process_ID')

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe.csv' % (output_directory)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path))
        data = load_data(inputs_file_path, column_headers, selection_criteria)
        # Change sentinel value to speed up training.
        data = data.mask(data < -25., -9.)
        #data = data.replace(to_replace=-99., value=-9.0)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)
    print('<main> data columns: ', (data.columns.values.tolist()))

    n = len(data)
    nHH = len(data.iloc[data.target.values == 1])
    nbckg = len(data.iloc[data.target.values == 0])
    print("Total (train+validation) length of HH = %i, bckg = %i" % (nHH, nbckg))

    # Make instance of plotter tool
    Plotter = plotter()
    # Create statistically independent training/testing data
    traindataset, valdataset = train_test_split(data, test_size=0.1)
    valdataset.to_csv((output_directory + 'valid_dataset.csv'), index=False)
    print('<train-DNN> Training dataset shape: ', traindataset.shape)
    print('<train-DNN> Validation dataset shape: ', valdataset.shape)

    # Event weights
    weights_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH', 'weight']
    weights_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg', 'weight']
    weights_for_DiPhoton = traindataset.loc[traindataset['process_ID'] == 'DiPhoton', 'weight']
    weights_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet', 'weight']
    weights_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD', 'weight']
    weights_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY', 'weight']
    weights_for_TTGsJets = traindataset.loc[traindataset['process_ID'] == 'TTGsJets', 'weight']
    weights_for_WGsJets = traindataset.loc[traindataset['process_ID'] == 'WGsJets', 'weight']
    weights_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW', 'weight']
    HHsum_weighted = sum(weights_for_HH)
    Hggsum_weighted = sum(weights_for_Hgg)
    DiPhotonsum_weighted = sum(weights_for_DiPhoton)
    GJetsum_weighted = sum(weights_for_GJet)
    QCDsum_weighted = sum(weights_for_QCD)
    DYsum_weighted = sum(weights_for_DY)
    TTGsJetssum_weighted = sum(weights_for_TTGsJets)
    WGsJetssum_weighted = sum(weights_for_WGsJets)
    WWsum_weighted = sum(weights_for_WW)
    bckgsum_weighted = (Hggsum_weighted + DiPhotonsum_weighted + GJetsum_weighted
                        + QCDsum_weighted + DYsum_weighted + TTGsJetssum_weighted
                        + WGsJetssum_weighted + WWsum_weighted)
    #bckgsum_weighted = DiPhotonsum_weighted + GJetsum_weighted + QCDsum_weighted + DYsum_weighted + TTGsJetssum_weighted + WGsJetssum_weighted + WWsum_weighted

    nevents_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH', 'unweighted']
    nevents_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg', 'unweighted']
    nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID'] == 'DiPhoton', 'unweighted']
    nevents_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet', 'unweighted']
    nevents_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD', 'unweighted']
    nevents_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY', 'unweighted']
    nevents_for_TTGsJets = traindataset.loc[traindataset['process_ID'] == 'TTGsJets', 'unweighted']
    nevents_for_WGsJets = traindataset.loc[traindataset['process_ID'] == 'WGsJets', 'unweighted']
    nevents_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW', 'unweighted']
    HHsum_unweighted = sum(nevents_for_HH)
    Hggsum_unweighted = sum(nevents_for_Hgg)
    DiPhotonsum_unweighted = sum(nevents_for_DiPhoton)
    GJetsum_unweighted = sum(nevents_for_GJet)
    QCDsum_unweighted = sum(nevents_for_QCD)
    DYsum_unweighted = sum(nevents_for_DY)
    TTGsJetssum_unweighted = sum(nevents_for_TTGsJets)
    WGsJetssum_unweighted = sum(nevents_for_WGsJets)
    WWsum_unweighted = sum(nevents_for_WW)
    bckgsum_unweighted = (Hggsum_unweighted + DiPhotonsum_unweighted + GJetsum_unweighted
                          + QCDsum_unweighted + DYsum_unweighted + TTGsJetssum_unweighted
                          + WGsJetssum_unweighted + WWsum_unweighted)
    #bckgsum_unweighted = DiPhotonsum_unweighted + GJetsum_unweighted + QCDsum_unweighted + DYsum_unweighted + TTGsJetssum_unweighted + WGsJetssum_unweighted + WWsum_unweighted

    HHsum_weighted = 2 * HHsum_weighted
    HHsum_unweighted = 2 * HHsum_unweighted

    if weights == 'BalanceYields':
        print('HHsum_weighted= ', HHsum_weighted)
        print('Hggsum_weighted= ', Hggsum_weighted)
        print('DiPhotonsum_weighted= ', DiPhotonsum_weighted)
        print('GJetsum_weighted= ', GJetsum_weighted)
        print('QCDsum_weighted= ', QCDsum_weighted)
        print('DYsum_weighted= ', DYsum_weighted)
        print('TTGsJetssum_weighted= ', TTGsJetssum_weighted)
        print('WGsJetssum_weighted= ', WGsJetssum_weighted)
        print('WWsum_weighted= ', WWsum_weighted)
        print('bckgsum_weighted= ', bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'HH', ['classweight']] = HHsum_unweighted / HHsum_weighted
        traindataset.loc[traindataset['process_ID'] == 'Hgg', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'GJet', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'QCD', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DY', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WW', ['classweight']] = (HHsum_unweighted / bckgsum_weighted)

    if weights == 'BalanceNonWeighted':
        print('HHsum_unweighted= ', HHsum_unweighted)
        print('Hggsum_unweighted= ', Hggsum_unweighted)
        print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted)
        print('GJetsum_unweighted= ', GJetsum_unweighted)
        print('QCDsum_unweighted= ', QCDsum_unweighted)
        print('DYsum_unweighted= ', DYsum_unweighted)
        print('TTGsJetssum_unweighted= ', TTGsJetssum_unweighted)
        print('WGsJetssum_unweighted= ', WGsJetssum_unweighted)
        print('WWsum_unweighted= ', WWsum_unweighted)
        print('bckgsum_unweighted= ', bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'HH', ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID'] == 'Hgg', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'GJet', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'QCD', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DY', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'WGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'WW', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)

    # Remove column headers that aren't input variables
    training_columns = column_headers[:-6]
    print('<train-DNN> Training features: ', training_columns)

    column_order_txt = '%s/column_order.txt' % (output_directory)
    column_order_file = open(column_order_txt, "wb")
    for tc_i in training_columns:
        line = tc_i + "\n"
        pickle.dump(str(line), column_order_file)

    num_variables = len(training_columns)

    # Extract training and testing data
    X_train = traindataset[training_columns].values
    X_test = valdataset[training_columns].values
    # Extract labels data
    Y_train = traindataset['target'].values
    Y_test = valdataset['target'].values

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]

    # Event weights if wanted
    train_weights = traindataset['weight'].values
    test_weights = valdataset['weight'].values

    # Weights applied during training.
    if weights == 'BalanceYields':
        trainingweights = traindataset.loc[:, 'classweight'] * traindataset.loc[:, 'weight']
    if weights == 'BalanceNonWeighted':
        trainingweights = traindataset.loc[:, 'classweight']
    trainingweights = np.array(trainingweights)

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name + '.png')
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name + '.pdf')

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)

    if do_model_fit == 1:
        print('<train-BinaryDNN> Training new model . . . . ')
        histories = []
        labels = []
        if hyp_param_scan == 1:
            print('Begin at local time: ', time.localtime())
            hyp_param_scan_name = 'hyp_param_scan_results.txt'
            hyp_param_scan_results = open(hyp_param_scan_name, 'a')
            time_str = str(time.localtime()) + '\n'
            hyp_param_scan_results.write(time_str)
            hyp_param_scan_results.write(weights)
            learn_rates = [0.00001, 0.0001]
            epochs = [150, 200]
            batch_size = [400, 500]
            param_grid = dict(learn_rate=learn_rates, epochs=epochs, batch_size=batch_size)
            model = KerasClassifier(build_fn=gscv_model, verbose=0)
            grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
            grid_result = grid.fit(X_train, Y_train, shuffle=True, sample_weight=trainingweights)
            print("Best score: %f , best params: %s" % (grid_result.best_score_, grid_result.best_params_))
            hyp_param_scan_results.write("Best score: %f , best params: %s\n"
                                         % (grid_result.best_score_, grid_result.best_params_))
            means = grid_result.cv_results_['mean_test_score']
            stds = grid_result.cv_results_['std_test_score']
            params = grid_result.cv_results_['params']
            for mean, stdev, param in zip(means, stds, params):
                print("Mean (stdev) test score: %f (%f) with parameters: %r" % (mean, stdev, param))
                hyp_param_scan_results.write("Mean (stdev) test score: %f (%f) with parameters: %r\n"
                                             % (mean, stdev, param))
            exit()
        else:
            # Define model for analysis
            early_stopping_monitor = EarlyStopping(patience=100, monitor='val_loss', min_delta=0.01, verbose=1)
            #model = baseline_model(num_variables, learn_rate=learn_rate)
            model = new_model(num_variables, learn_rate=learn_rate)
            # Fit the model
            # Batch size = examples before updating weights (larger = faster training)
            # Epoch = One pass over data (useful for periodic logging and evaluation)
            #class_weights = np.array(class_weight.compute_class_weight('balanced', np.unique(Y_train), Y_train))
            history = model.fit(X_train, Y_train,
                                validation_split=validation_split,
                                epochs=epochs,
                                batch_size=batch_size,
                                verbose=1,
                                shuffle=True,
                                sample_weight=trainingweights,
                                callbacks=[early_stopping_monitor])
            histories.append(history)
            labels.append(optimizer)
            # Make plot of loss function evolution
            Plotter.plot_training_progress_acc(histories, labels)
            acc_progress_filename = 'DNN_acc_wrt_epoch'
            Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename + '.png')
            Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename + '.pdf')
            Plotter.history_plot(history, label='loss')
            Plotter.save_plots(dir=plots_dir, filename='history_loss.png')
            Plotter.save_plots(dir=plots_dir, filename='history_loss.pdf')
    else:
        model_name = os.path.join(output_directory, 'model.h5')
        model = load_trained_model(model_name)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))
    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory, 'model_schematic.png')
    #plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)

    print('================')
    print('Training event labels: ', len(Y_train))
    print('Training event probs', len(result_probs))
    print('Training event weights: ', len(train_weights))
    print('Testing events: ', len(Y_test))
    print('Testing event probs', len(result_probs_test))
    print('Testing event weights: ', len(test_weights))
    print('================')

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    Plotter.ROC(model, X_test, Y_test, X_train, Y_train)
    Plotter.save_plots(dir=plots_dir, filename='ROC.png')
    Plotter.save_plots(dir=plots_dir, filename='ROC.pdf')
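# For reference, a minimal sklearn sketch of the ROC/AUC quantities that the
# project-specific Plotter.ROC presumably draws (an assumption, not the
# wrapper's actual implementation):
import numpy as np
from sklearn.metrics import auc, roc_curve

def roc_auc_summary(model, X, y, sample_weight=None):
    """Return (fpr, tpr, auc) for a binary Keras model on features X."""
    probs = model.predict(np.array(X)).ravel()
    fpr, tpr, _ = roc_curve(y, probs, sample_weight=sample_weight)
    return fpr, tpr, auc(fpr, tpr)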
def main():
    print('Using Keras version: ', keras.__version__)
    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument('-t', '--train_model', dest='train_model',
                        help='Option to train model or simply make diagnostic plots (0=False, 1=True)',
                        default=1, type=int)
    parser.add_argument('-s', '--suff', dest='suffix',
                        help='Option to choose suffix for training', default='', type=str)
    parser.add_argument('-p', '--para', dest='hyp_param_scan',
                        help='Option to run hyper-parameter scan', default=0, type=int)
    parser.add_argument('-i', '--inputs_file_path', dest='inputs_file_path',
                        help='Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.',
                        default='', type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    suffix = args.suffix

    # Create instance of the input files directory
    inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/'
    hyp_param_scan = args.hyp_param_scan

    # Set model hyper-parameters
    weights = 'BalanceYields'  # 'BalanceYields' or 'BalanceNonWeighted'
    optimizer = 'Nadam'
    validation_split = 0.1
    # hyper-parameter scan results
    if weights == 'BalanceNonWeighted':
        learn_rate = 0.0005
        epochs = 200
        batch_size = 200
    if weights == 'BalanceYields':
        learn_rate = 0.0001
        epochs = 200
        batch_size = 400

    # Create instance of output directory where all results are saved.
    output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix, weights)
    check_dir(output_directory)
    hyperparam_file = os.path.join(output_directory, 'additional_model_hyper_params.txt')
    additional_hyperparams = open(hyperparam_file, 'w')
    additional_hyperparams.write("optimizer: " + optimizer + "\n")
    additional_hyperparams.write("learn_rate: " + str(learn_rate) + "\n")
    additional_hyperparams.write("epochs: " + str(epochs) + "\n")
    additional_hyperparams.write("validation_split: " + str(validation_split) + "\n")
    additional_hyperparams.write("weights: " + weights + "\n")
    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')
    input_var_jsonFile = open('input_variables.json', 'r')
    selection_criteria = '( ((Leading_Photon_pt/CMS_hgg_mass) > 0.35) && ((Subleading_Photon_pt/CMS_hgg_mass) > 0.25) && passbVeto==1 && ExOneLep==1 && N_goodJets>=1)'
    # selection_criteria = '(AtLeast4GoodJets0Lep==1)'
    # selection_criteria = '(passPhotonSels==1 && passbVeto==1 && ExOneLep==1 && goodJets==1)'

    # Load Variables from .json
    variable_list = json.load(input_var_jsonFile, encoding="utf-8").items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('weight')
    column_headers.append('unweighted')
    column_headers.append('target')
    column_headers.append('key')
    column_headers.append('classweight')
    column_headers.append('process_ID')

    # Create instance of the input files directory
    #inputs_file_path = '/afs/cern.ch/work/a/atishelm/public/ForJosh/2017_DataMC_ntuples_moreVars'
    inputs_file_path = '/eos/user/r/rasharma/post_doc_ihep/double-higgs/ntuples/September29/MVANtuples'
    #inputs_file_path = '/eos/user/a/atishelm/ntuples/HHWWgg_DataSignalMCnTuples/PromptPromptApplied/'
    #inputs_file_path = 'PromptPromptApplied/'

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe.csv' % (output_directory)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path))
        data = load_data(inputs_file_path, column_headers, selection_criteria)
        # Change sentinel value to speed up training.
        data = data.replace(to_replace=-999.000000, value=-9.0)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)
    print('<main> data columns: ', (data.columns.values.tolist()))

    n = len(data)
    nHH = len(data.iloc[data.target.values == 1])
    nbckg = len(data.iloc[data.target.values == 0])
    print("Total (train+validation) length of HH = %i, bckg = %i" % (nHH, nbckg))

    # Make instance of plotter tool
    Plotter = plotter()
    # Create statistically independent training/testing data
    traindataset, valdataset = train_test_split(data, test_size=0.1)
    valdataset.to_csv((output_directory + 'valid_dataset.csv'), index=False)
    print('<train-DNN> Training dataset shape: ', traindataset.shape)
    print('<train-DNN> Validation dataset shape: ', valdataset.shape)

    # Event weights
    weights_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH', 'weight']
    weights_for_DiPhoton = traindataset.loc[traindataset['process_ID'] == 'DiPhoton', 'weight']
    weights_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet', 'weight']
    weights_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY', 'weight']
    weights_for_TTGG = traindataset.loc[traindataset['process_ID'] == 'TTGG', 'weight']
    weights_for_TTGJets = traindataset.loc[traindataset['process_ID'] == 'TTGJets', 'weight']
    weights_for_TTJets = traindataset.loc[traindataset['process_ID'] == 'TTJets', 'weight']
    weights_for_WJets = traindataset.loc[traindataset['process_ID'] == 'WJets', 'weight']
    weights_for_ttH = traindataset.loc[traindataset['process_ID'] == 'ttH', 'weight']
    HHsum_weighted = sum(weights_for_HH)
    GJetsum_weighted = sum(weights_for_GJet)
    DiPhotonsum_weighted = sum(weights_for_DiPhoton)
    TTGGsum_weighted = sum(weights_for_TTGG)
    TTGJetssum_weighted = sum(weights_for_TTGJets)
    TTJetssum_weighted = sum(weights_for_TTJets)
    WJetssum_weighted = sum(weights_for_WJets)
    ttHsum_weighted = sum(weights_for_ttH)
    DYsum_weighted = sum(weights_for_DY)
    #bckgsum_weighted = DiPhotonsum_weighted + WJetssum_weighted + ttHsum_weighted
    bckgsum_weighted = DiPhotonsum_weighted + WJetssum_weighted

    nevents_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH', 'unweighted']
    nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID'] == 'DiPhoton', 'unweighted']
    nevents_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet', 'unweighted']
    nevents_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY', 'unweighted']
    nevents_for_TTGG = traindataset.loc[traindataset['process_ID'] == 'TTGG', 'unweighted']
    nevents_for_TTGJets = traindataset.loc[traindataset['process_ID'] == 'TTGJets', 'unweighted']
    nevents_for_TTJets = traindataset.loc[traindataset['process_ID'] == 'TTJets', 'unweighted']
    nevents_for_WJets = traindataset.loc[traindataset['process_ID'] == 'WJets', 'unweighted']
    nevents_for_ttH = traindataset.loc[traindataset['process_ID'] == 'ttH', 'unweighted']
    HHsum_unweighted = sum(nevents_for_HH)
    GJetsum_unweighted = sum(nevents_for_GJet)
    DiPhotonsum_unweighted = sum(nevents_for_DiPhoton)
    TTGGsum_unweighted = sum(nevents_for_TTGG)
    TTGJetssum_unweighted = sum(nevents_for_TTGJets)
    TTJetssum_unweighted = sum(nevents_for_TTJets)
    WJetssum_unweighted = sum(nevents_for_WJets)
    ttHsum_unweighted = sum(nevents_for_ttH)
    DYsum_unweighted = sum(nevents_for_DY)
    #bckgsum_unweighted = DiPhotonsum_unweighted + WJetssum_unweighted + ttHsum_unweighted
    bckgsum_unweighted = DiPhotonsum_unweighted + WJetssum_unweighted

    if weights == 'BalanceYields':
        print('HHsum_weighted= ', HHsum_weighted)
        print('ttHsum_weighted= ', ttHsum_weighted)
        print('DiPhotonsum_weighted= ', DiPhotonsum_weighted)
        print('WJetssum_weighted= ', WJetssum_weighted)
        print('DYsum_weighted= ', DYsum_weighted)
        print('GJetsum_weighted= ', GJetsum_weighted)
        print('bckgsum_weighted= ', bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'HH', ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID'] == 'GJet', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DY', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WJets', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGG', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGJets', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'TTJets', ['classweight']] = (HHsum_weighted / bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'ttH', ['classweight']] = (HHsum_weighted / bckgsum_weighted)

    if weights == 'BalanceNonWeighted':
        print('HHsum_unweighted= ', HHsum_unweighted)
        print('ttHsum_unweighted= ', ttHsum_unweighted)
        print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted)
        print('WJetssum_unweighted= ', WJetssum_unweighted)
        print('DYsum_unweighted= ', DYsum_unweighted)
        print('GJetsum_unweighted= ', GJetsum_unweighted)
        print('bckgsum_unweighted= ', bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'HH', ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID'] == 'GJet', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DY', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'WJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGG', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'TTJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'ttH', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted)

    # Remove column headers that aren't input variables
    training_columns = column_headers[:-6]
    print('<train-DNN> Training features: ', training_columns)

    column_order_txt = '%s/column_order.txt' % (output_directory)
    column_order_file = open(column_order_txt, "wb")
    for tc_i in training_columns:
        line = tc_i + "\n"
        pickle.dump(str(line), column_order_file)

    num_variables = len(training_columns)

    # Extract training and testing data
    X_train = traindataset[training_columns].values
    X_test = valdataset[training_columns].values
    # Extract labels data
    Y_train = traindataset['target'].values
    Y_test = valdataset['target'].values

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot.png'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    ####################################################################################
    # Weights applied during training. You will also need to update the class weights if
    # you are going to change the event weights applied. Introduce class weights and any
    # event weight you want to use here.
    #trainingweights = traindataset.loc[:,'classbalance']#*traindataset.loc[:,'weight']
    #trainingweights = np.array(trainingweights)
    # Temp hack to be able to change class weights without remaking dataframe
    #for inde in xrange(len(trainingweights)):
    #    newweight = 13243.0/6306.0
    #    trainingweights[inde] = newweight
    #print 'training event weight = ', trainingweights[0]

    # Event weights calculation so we can correctly apply event weights to diagnostic plots.
    # Use a separate list because we don't want to apply class weights in plots.
    # Event weights if wanted
    train_weights = traindataset['weight'].values
    test_weights = valdataset['weight'].values

    # Weights applied during training.
    if weights == 'BalanceYields':
        trainingweights = traindataset.loc[:, 'classweight'] * traindataset.loc[:, 'weight']
    if weights == 'BalanceNonWeighted':
        trainingweights = traindataset.loc[:, 'classweight']
    trainingweights = np.array(trainingweights)

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot.pdf'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)

    if do_model_fit == 1:
        print('<train-BinaryDNN> Training new model . . . . ')
        histories = []
        labels = []
        if hyp_param_scan == 1:
            print('Begin at local time: ', time.localtime())
            hyp_param_scan_name = 'hyp_param_scan_results.txt'
            hyp_param_scan_results = open(hyp_param_scan_name, 'a')
            time_str = str(time.localtime()) + '\n'
            hyp_param_scan_results.write(time_str)
            hyp_param_scan_results.write(weights)
            learn_rates = [0.00001, 0.0001]
            epochs = [150, 200]
            batch_size = [400, 500]
            param_grid = dict(learn_rate=learn_rates, epochs=epochs, batch_size=batch_size)
            model = KerasClassifier(build_fn=gscv_model, verbose=0)
            grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
            grid_result = grid.fit(X_train, Y_train, shuffle=True, sample_weight=trainingweights)
            print("Best score: %f , best params: %s" % (grid_result.best_score_, grid_result.best_params_))
            hyp_param_scan_results.write("Best score: %f , best params: %s\n"
                                         % (grid_result.best_score_, grid_result.best_params_))
            means = grid_result.cv_results_['mean_test_score']
            stds = grid_result.cv_results_['std_test_score']
            params = grid_result.cv_results_['params']
            for mean, stdev, param in zip(means, stds, params):
                print("Mean (stdev) test score: %f (%f) with parameters: %r" % (mean, stdev, param))
                hyp_param_scan_results.write("Mean (stdev) test score: %f (%f) with parameters: %r\n"
                                             % (mean, stdev, param))
            exit()
        else:
            # Define model for analysis
            early_stopping_monitor = EarlyStopping(patience=30, monitor='val_loss', verbose=1)
            model = baseline_model(num_variables, learn_rate=learn_rate)
            # Fit the model
            # Batch size = examples before updating weights (larger = faster training)
            # Epoch = One pass over data (useful for periodic logging and evaluation)
            #class_weights = np.array(class_weight.compute_class_weight('balanced', np.unique(Y_train), Y_train))
            history = model.fit(X_train, Y_train,
                                validation_split=validation_split,
                                epochs=epochs,
                                batch_size=batch_size,
                                verbose=1,
                                shuffle=True,
                                sample_weight=trainingweights,
                                callbacks=[early_stopping_monitor])
            histories.append(history)
            labels.append(optimizer)
            # Make plot of loss function evolution
            Plotter.plot_training_progress_acc(histories, labels)
            acc_progress_filename = 'DNN_acc_wrt_epoch.png'
            Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename)
    else:
        model_name = os.path.join(output_directory, 'model.h5')
        model = load_trained_model(model_name)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))
    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory, 'model_schematic.eps')
    print("DEBUG: ", model_schematic_name)
    plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)
    # plot_model(model, to_file='model_schematic.eps', show_shapes=True, show_layer_names=True)

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory
    '''
    print('================')
    print('Training event labels: ', len(Y_train))
    print('Training event probs', len(result_probs))
    print('Training event weights: ', len(train_weights))
    print('Testing events: ', len(Y_test))
    print('Testing event probs', len(result_probs_test))
    print('Testing event weights: ', len(test_weights))
    print('================')
    '''

    # Make overfitting plots of output nodes
    Plotter.binary_overfitting(model, Y_train, Y_test, result_probs, result_probs_test,
                               plots_dir, train_weights, test_weights)
    print("DEBUG: Y_train shape: ", Y_train.shape)

    # # Get true process integers for training dataset
    # original_encoded_train_Y = []
    # for i in xrange(len(result_probs)):
    #     if Y_train[i][0] == 1:
    #         original_encoded_train_Y.append(0)
    #     if Y_train[i][1] == 1:
    #         original_encoded_train_Y.append(1)
    #     if Y_train[i][2] == 1:
    #         original_encoded_train_Y.append(2)
    #     if Y_train[i][3] == 1:
    #         original_encoded_train_Y.append(3)
    # Get true class values for testing dataset
    # result_classes_test = newencoder.inverse_transform(result_classes_test)
    # result_classes_train = newencoder.inverse_transform(result_classes)

    e = shap.DeepExplainer(model, X_train[:400, ])
    shap_values = e.shap_values(X_test[:400, ])
    Plotter.plot_dot(title="DeepExplainer_sigmoid_y0", x=X_test[:400, ],
                     shap_values=shap_values, column_headers=column_headers)
    Plotter.plot_dot_bar(title="DeepExplainer_Bar_sigmoid_y0", x=X_test[:400, ],
                         shap_values=shap_values, column_headers=column_headers)
    #e = shap.GradientExplainer(model, X_train[:100, ])
    #shap_values = e.shap_values(X_test[:100, ])
    #Plotter.plot_dot(title="GradientExplainer_sigmoid_y0", x=X_test[:100, ], shap_values=shap_values, column_headers=column_headers)
    #e = shap.KernelExplainer(model.predict, X_train[:100, ])
    #shap_values = e.shap_values(X_test[:100, ])
    #Plotter.plot_dot(title="KernelExplainer_sigmoid_y0", x=X_test[:100, ], shap_values=shap_values, column_headers=column_headers)
    #Plotter.plot_dot_bar(title="KernelExplainer_Bar_sigmoid_y0", x=X_test[:100, ], shap_values=shap_values, column_headers=column_headers)
    #Plotter.plot_dot_bar_all(title="KernelExplainer_bar_All_Var_sigmoid_y0", x=X_test[:100, ], shap_values=shap_values, column_headers=column_headers)

    # Create confusion matrices for training and testing performance
    # Plotter.conf_matrix(original_encoded_train_Y, result_classes_train, train_weights, 'index')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y, result_classes_test, test_weights, 'index')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TEST.png')
    # Plotter.conf_matrix(original_encoded_train_Y, result_classes_train, train_weights, 'columns')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y, result_classes_test, test_weights, 'columns')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TEST.png')
    # Plotter.conf_matrix(original_encoded_train_Y, result_classes_train, train_weights, '')
    # Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y, result_classes_test, test_weights, '')
    # Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TEST.png')

    Plotter.ROC_sklearn(Y_train, result_probs, Y_test, result_probs_test, 1,
                        'BinaryClassifierROC', train_weights, test_weights)
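# For context, a minimal sketch of a direct SHAP summary plot. The
# Plotter.plot_dot wrappers above are project-specific; this standalone
# helper is an assumption about the equivalent library-level call, not the
# wrapper's actual implementation:
import shap

def shap_summary(model, background, samples, feature_names):
    # DeepExplainer attributes the model output to each input feature.
    explainer = shap.DeepExplainer(model, background)
    values = explainer.shap_values(samples)
    shap.summary_plot(values, samples, feature_names=feature_names)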
model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.RMSprop(lr=1e-4),
              metrics=['acc'])
history = model.fit(X_train, y_train, epochs=35, batch_size=150,
                    validation_data=(X_val, y_val))
model.save('best_model_imag_3clases.h5')

Y_pred_proba = model.predict(X_test)
Y_pred = model.predict_classes(X_test)
test_loss_trained_net, test_acc_trained_net = model.evaluate(X_test, y_test)
print('test_acc:', test_acc_trained_net)
# Result: 75% accuracy on the test set.

matrix = confusion_matrix(y_test.argmax(axis=1), Y_pred)
labels_grouped = np.array(["Musica popular", "Musica melodica", "Musica ritmica"])
plot_confusion_matrix(y_test.argmax(axis=1), Y_pred, classes=labels_grouped,
                      title='Confusion matrix, without normalization')

# Plot the loss (error) as a function of epochs.
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
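# A minimal sketch completing the loss plot from the history values above;
# standard matplotlib calls, with epoch indices derived from the history length:
import matplotlib.pyplot as plt

epochs_range = range(1, len(loss_values) + 1)
plt.plot(epochs_range, loss_values, 'bo', label='Training loss')
plt.plot(epochs_range, val_loss_values, 'b', label='Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()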
def modeling(conn, sentences, lib, dz):
    #def modeling(conn, df, lib, dz):
    #pts = pd.read_sql("SELECT DISTINCT SUBJECT_ID from UFM", conn)
    #pts = list(set(pts.SUBJECT_ID))
    #pool = []
    #for d in dz:
    #    pool += d.pos + d.neg
    np.random.seed(7)
    decay_rate = .0002  # renamed so the decay() helper called below is not shadowed
    data = []
    train = []
    test = []
    keys = [k[1] for k in lib]
    admits = pd.read_sql("SELECT * from admissions", conn)
    for itr in range(0, 5):
        print("Sess: {0}".format(itr))
        for d in dz:
            neg = random.sample(d[1], len(d[0]))
            temp = d[0] + neg
            random.shuffle(temp)
            t1, t2 = cross_validation.train_test_split(temp, test_size=.2)
            train += t1
            test += t2

    # X stands for raw indexes of feature input; V stands for raw feature input
    # W stands for word vectors from feature input trained by Word2Vec
    X_train = []; t_train = []; W_train = []; Y_train = []
    X_test = []; t_test = []; W_test = []; Y_test = []
    V_train = []; V_test = []
    count = 0
    for t in train:
        print(count)
        count += 1
        corpus = [[s[2], s[3]] for s in sentences
                  if (s[0] == t[0]) and
                  (pd.to_datetime(admits[admits['HADM_ID'] == s[1]].ADMITTIME.values[0]) <= t[1])]
        # order subject by time of entry for each sentence (admission)
        corpus = sorted(corpus, key=lambda x: x[1])
        # transpose into nx2xd from 2xnxd
        # this way, corpus[0] refers to words and corpus[1] refers to times
        corpus = list(map(list, zip(*corpus)))
        x_train = list(chain.from_iterable(corpus[0]))
        t_stamps = list(chain.from_iterable(corpus[1]))
        x = np.array(list(map(lambda x: keys.index(x), x_train)))
        # configure each timestamp to reflect time elapsed from first time entry
        # calculate time decay from initial event
        temp = t_stamps[0]
        t_stamps = [ii - temp for ii in t_stamps]
        # append
        X_train.append(x)
        V_train.append(np.array(x_train))
        t_train.append(np.array(t_stamps))
        Y_train.append(t[3])
    print("X_train made.")
    count = 0
    for t in test:
        print(count)
        count += 1
        corpus = [[s[2], s[3]] for s in sentences
                  if (s[0] == t[0]) and
                  (pd.to_datetime(admits[admits['HADM_ID'] == s[1]].ADMITTIME.values[0]) <= t[1])]
        corpus = sorted(corpus, key=lambda x: x[1])
        corpus = list(map(list, zip(*corpus)))
        x_test = list(chain.from_iterable(corpus[0]))
        t_stamps = list(chain.from_iterable(corpus[1]))
        temp = t_stamps[0]
        t_stamps = [ii - temp for ii in t_stamps]
        x = np.array(list(map(lambda x: keys.index(x), x_test)))
        X_test.append(x)
        V_test.append(np.array(x_test))  # was x_train: copy-paste bug
        t_test.append(np.array(t_stamps))
        Y_test.append(t[3])

    # training normal LSTM and CNN-LSTM
    top_words = [9444]
    max_review_length = [1000]
    embedding_length = [300]
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_length[0])
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_length[0])

    # build model using KerasClassifier and GridSearch
    cnn = KerasClassifier(build_fn=cnn_train, verbose=1)
    lstm = KerasClassifier(build_fn=lstm_train, verbose=1)
    d_cnn = KerasClassifier(build_fn=d_cnn_train, verbose=1)
    d_lstm = KerasClassifier(build_fn=d_lstm_train, verbose=1)

    # define the grid search parameters
    batch_size = [32, 64, 128]
    epochs = [20, 50, 100, 200]
    optimizer = ['SGD', 'RMSprop', 'Adam']
    learn_rate = (10.0**np.arange(-4, -1)).tolist()
    momentum = np.arange(.5, .9, .1).tolist()
    neurons = [50, 100, 200]
    dropout_W = [.1, .2, .5]
    dropout_U = [.1, .2, .5]
    W_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
    U_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
    init_mode = ['uniform', 'normal', 'zero']
    #activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
    param_grid = dict(top_words=top_words, max_length=max_review_length,
                      embedding_length=embedding_length, batch_size=batch_size,
                      nb_epoch=epochs, optimizer=optimizer, learn_rate=learn_rate,
                      momentum=momentum, neurons=neurons, dropout_W=dropout_W,
                      dropout_U=dropout_U, W_regularizer=W_regularizer,
                      U_regularizer=U_regularizer, init_mode=init_mode)
    d_param_grid = dict(input_shape=[(max_review_length[0], embedding_length[0])],
                        batch_size=batch_size, nb_epoch=epochs, optimizer=optimizer,
                        learn_rate=learn_rate, momentum=momentum, neurons=neurons,
                        dropout_W=dropout_W, dropout_U=dropout_U,
                        W_regularizer=W_regularizer, U_regularizer=U_regularizer,
                        init_mode=init_mode)
    lr_params = {'C': (10.0**np.arange(-4, 4)).tolist(), 'penalty': ('l1', 'l2')}
    sv_params = {'C': (10.0**np.arange(-4, 4)).tolist(), 'kernel': ('linear', 'poly', 'rbf', 'sigmoid')}
    rf_params = {'criterion': ['gini', 'entropy']}

    # setup GridSearch w/ cross validation
    cnn_grid = GridSearchCV(estimator=cnn, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
    lstm_grid = GridSearchCV(estimator=lstm, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
    d_cnn_grid = GridSearchCV(estimator=d_cnn, param_grid=d_param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
    d_lstm_grid = GridSearchCV(estimator=d_lstm, param_grid=d_param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
    # Note: GridSearchCV takes a single estimator (and the keyword is cv, not
    # sv); searching LR, SVM and RF really needs one grid per estimator, as in
    # the commented-out lines below.
    classics = GridSearchCV(estimator=(LR, SVM, RF), param_grid=(lr_params, sv_params, rf_params),
                            scoring='roc_auc', cv=5, n_jobs=-1)
    #lr_grid = GridSearchCV(estimator=LR, param_grid=lr_params, scoring='roc_auc', cv=5, n_jobs=-1)
    #sv_grid = GridSearchCV(estimator=SVM, param_grid=sv_params, scoring='roc_auc', cv=5, n_jobs=-1)
    #rf_grid = GridSearchCV(estimator=RF, param_grid=rf_params, scoring='roc_auc', cv=5, n_jobs=-1)

    # Fit the model
    cnn_result = cnn_grid.fit(X_train, Y_train)
    lstm_result = lstm_grid.fit(X_train, Y_train)
    d_cnn_result = d_cnn_grid.fit(decay(x=np.array(V_train), t_stamps=t_train,
                                        embedding_length=embedding_length[0],
                                        max_review_length=max_review_length[0])[0], Y_train)
    d_lstm_result = d_lstm_grid.fit(decay(x=np.array(V_train), t_stamps=t_train,
                                          embedding_length=embedding_length[0],
                                          max_review_length=max_review_length[0])[0], Y_train)
    classics_result = classics.fit(decay(x=V_train, t_stamps=t_train,
                                         embedding_length=embedding_length[0],
                                         max_review_length=max_review_length[0])[1], Y_train)
    #lr_result = lr_grid.fit(decay(x=V_train, t_stamps=t_train, embedding_length=embedding_length, max_review_length=max_review_length)[1], Y_train)
    #sv_result = sv_grid.fit(decay(x=V_train, t_stamps=t_train, embedding_length=embedding_length, max_review_length=max_review_length)[1], Y_train)
    #rf_result = rf_grid.fit(decay(x=V_train, t_stamps=t_train, embedding_length=embedding_length, max_review_length=max_review_length)[1], Y_train)

    # grid_search results:
    print("CNN Best: %f using %s" % (cnn_result.best_score_, cnn_result.best_params_))
    means = cnn_result.cv_results_['mean_test_score']
    stds = cnn_result.cv_results_['std_test_score']
    params = cnn_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    print("LSTM Best: %f using %s" % (lstm_result.best_score_, lstm_result.best_params_))
    means = lstm_result.cv_results_['mean_test_score']
    stds = lstm_result.cv_results_['std_test_score']
    params = lstm_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    print("Decay CNN Best: %f using %s" % (d_cnn_result.best_score_, d_cnn_result.best_params_))
    means = d_cnn_result.cv_results_['mean_test_score']
    stds = d_cnn_result.cv_results_['std_test_score']
    params = d_cnn_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    print("Decay LSTM Best: %f using %s" % (d_lstm_result.best_score_, d_lstm_result.best_params_))
    means = d_lstm_result.cv_results_['mean_test_score']
    stds = d_lstm_result.cv_results_['std_test_score']
    params = d_lstm_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    print("Best of Classics: %f using %s, %s" % (classics_result.best_score_,
                                                 classics_result.best_estimator_,
                                                 classics_result.best_params_))
    means = classics_result.cv_results_['mean_test_score']
    stds = classics_result.cv_results_['std_test_score']
    params = classics_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    #KFold = 5
    #kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
    #cvscores = []
    #for training, testing in kfold.split(X_train, Y_train):
    #    # Fit the model
    #    model.fit(X[training], Y[training], nb_epoch=150, batch_size=10, verbose=0)
    #    # evaluate the model
    #    scores = model.evaluate(X[testing], Y[testing], verbose=0)
    #    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    #    cvscores.append(scores[1] * 100)
    #print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

    ######TESTING#######
    # The builder functions expect scalars, so unpack the grid lists.
    cnn = cnn_train(top_words=top_words[0], max_length=max_review_length[0],
                    embedding_length=embedding_length[0])
    lstm = lstm_train(top_words=top_words[0], max_length=max_review_length[0],
                      embedding_length=embedding_length[0])
    cnn.fit(X_train, Y_train, validation_split=.2, nb_epoch=100, batch_size=128,
            shuffle=True, verbose=1)
    lstm.fit(X_train, Y_train, validation_split=.2, nb_epoch=100, batch_size=128,
             shuffle=True, verbose=1)

    # testing
    predictions_lstm = lstm.predict_classes(X_test)
    predictions_cnn = cnn.predict_classes(X_test)
    acc = accuracy_score(Y_test, predictions_lstm)
    f1 = f1_score(Y_test, predictions_lstm)
    auc = roc_auc_score(Y_test, predictions_lstm)
    scores_lstm = [("Accuracy", acc), ("F1 Score", f1), ("AUC Score", auc)]
    acc = accuracy_score(Y_test, predictions_cnn)
    f1 = f1_score(Y_test, predictions_cnn)
    auc = roc_auc_score(Y_test, predictions_cnn)
    scores_cnn = [("Accuracy", acc), ("F1 Score", f1), ("AUC Score", auc)]
    print("LSTM DATA: ")
    for s in scores_lstm:
        print("%s: %.2f" % (s[0], s[1]), end=" ")
    print("")
    print("CNN DATA: ")
    for s in scores_cnn:
        print("%s: %.2f" % (s[0], s[1]), end=" ")
    # Collect the session scores and return them.
    data.append([scores_lstm, scores_cnn])
    return data
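# Note: the sklearn.cross_validation module used above was removed in
# scikit-learn 0.20; with newer versions the equivalent split is imported
# from model_selection instead:
from sklearn.model_selection import train_test_split

t1, t2 = train_test_split(temp, test_size=.2)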
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_Perf_based, Y_train_Perf_based, epochs=100, batch_size=20,
          validation_split=0.3, verbose=1)
model.test_on_batch(X_test_Perf_based, Y_test_Perf_based, sample_weight=None)
pred_Perf_based = model.predict_classes(X_test_Perf_based, verbose=1)
print(confusion_matrix(Y_test_Perf_based, pred_Perf_based))
print(classification_report(Y_test_Perf_based, pred_Perf_based))
print(accuracy_score(Y_test_Perf_based, pred_Perf_based))
fpr_Perf_based, tpr_Perf_based, thresholds_Perf_based = roc_curve(
    Y_test_Perf_based, pred_Perf_based)
auc_keras = auc(fpr_Perf_based, tpr_Perf_based)
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_Perf_based, tpr_Perf_based,
         label='Performance Based Model (area = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
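# A minimal completion of the ROC figure above; standard matplotlib calls
# (the title string is an assumption):
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()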
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
else:
    model = create_model()
    # model.fit(train_data, train_label, batch_size=20, epochs=100, shuffle=True, verbose=1, validation_split=0.2)
    model.fit(train_data, train_label, batch_size=10, epochs=150, shuffle=True,
              verbose=1, validation_split=0.2)
    result = model.evaluate(test_data, test_label, batch_size=1000)
    print('loss:%5.6f acc:%5.6f' % (result[0], result[1]))

    # test_data = np.array([binary_encode(i) for i in range(1, 101)])
    # pred = model.predict(test_data)
    pred = model.predict_classes(test_data)
    # init_labels = lb.inverse_transform(pred)
    # print(init_labels)
    # Convert the one-hot-encoded prediction back to a normal letter
    results = []
    for i in range(1, 100):
        results.append('{}'.format(['fizzbuzz', 'fizz', 'buzz', i][pred[i - 1]]))
    print(', '.join(results))
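# With TensorFlow >= 2.6, where predict_classes was removed, the same class
# indices for this softmax model can be obtained with argmax:
import numpy as np

pred = np.argmax(model.predict(test_data), axis=-1)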