def cv(db, csv_target, csv_descriptors, n_splits_, n_repeats_, num_epochs,
       n_rot_train, train_steps_per_epoch_, n_rot_test, test_steps_per_epoch_,
       ndense_layers, nunits, nfilters, random_state, cvout=None,
       fcvgroup=None, featimp_out=None, y_recalc=False, mout=None):
    # Load the dataset
    ai = AIModel(csv_target, db, csv_descriptors)
    available_keys = ai.GetAvailableKeys()
    print("N. instances: %d" % (len(ai.target)))

    predictions = dict()
    valpredictions = dict()
    for key in ai.target.keys():
        predictions[key] = []
        valpredictions[key] = []

    feat_imp = None
    feat_imp_iterations = 20
    if featimp_out is not None:
        # Feature importance list for the csv descriptors
        if ai.other_descriptors is not None:
            feat_imp = [[] for p in range(ai.nfeatures)]
            # Charge voxel descriptor
            feat_imp.append([])
    else:
        print("Feature Importance calculation: DISABLED")

    # Create a directory to store all the models
    mout_path = None
    if mout is not None:
        # Utilised to store the out path
        mout_path = Path("%s_%s" % (time.strftime("%Y%m%d%H%M%S"), mout))
        mout_path.mkdir(exist_ok=True, parents=True)
        if ai.other_descriptors is not None:
            # Save the descriptor order
            f = open("%s/odesc_header.csv" % (str(mout_path.absolute())), "w")
            for item in ai.header:
                f.write("%s\n" % (item))
            f.close()

    # Choose between a static manual cross-validation group or
    # repeated k-fold cross-validation
    cvmethod = None
    cvgroups = None
    if fcvgroup is not None:
        cvgroups = CVGroupRead(fcvgroup)
        cvmethod = StaticGroupCV(cvgroups)
        # cvmethod = RepeatedStratifiedCV(cvgroups, n_repeats_, 2)
    else:
        cvmethod = RepeatedKFold(available_keys,
                                 n_splits_,
                                 n_repeats_,
                                 random_state,
                                 test_size=0.2)

    cv_ = 0
    for train_keys, val_keys, test_keys in cvmethod:
        print("Train set size: %d Val set size: %d Test set size: %d" % (
            len(train_keys), len(val_keys), len(test_keys)))
        # Some memory clean-up
        K.clear_session()
        # print(global_test_intexes)
        model = None
        model_ = GetKerasModel()
        if ai.other_descriptors is None:
            if model_ is None:
                model = build_model(ai.conv3d_chtype,
                                    ai.input_shape,
                                    ndense_layers,
                                    nunits,
                                    nfilters)
            else:
                model = model_(ai.conv3d_chtype,
                               ai.input_shape,
                               ndense_layers,
                               nunits,
                               nfilters)
            # model = model_scirep(ai.conv3d_chtype, ai.input_shape, ndense_layers, nunits, nfilters)
            # model = ResNetModel(ai.input_shape)
            print(model.summary())
        else:
            if model_ is None:
                model = build_2DData_model(ai.conv3d_chtype,
                                           ai.input_shape,
                                           ai.nfeatures,
                                           ndense_layers,
                                           nunits,
                                           nfilters)
            else:
                model = model_(ai.conv3d_chtype,
                               ai.input_shape,
                               ai.nfeatures,
                               ndense_layers,
                               nunits,
                               nfilters)
            """
            for l in model.layers[0].layers:
                print(l.summary())
            """
            print("Total Summary")
            print(model.summary())

        dname = os.path.basename(csv_target).replace(".csv", "")
        log_dir_ = ("./logs/cv%d_%s_%d_#rot%d_#f%d_#dl%d_#u%d_" % (
            cv_, dname, num_epochs, train_steps_per_epoch_, nfilters,
            ndense_layers, nunits))
        log_dir_ += time.strftime("%Y%m%d%H%M%S")
        model_outfile = "%s/%d.h5" % (str(mout_path.absolute()), cv_)
        callbacks_ = [
            TensorBoard(log_dir=log_dir_,
                        histogram_freq=0,
                        write_graph=False,
                        write_images=False),
            ModelCheckpoint(model_outfile,
                            monitor='val_loss',
                            verbose=0,
                            save_best_only=True)
        ]

        train_generator = ai.VoxelTrainGenerator(train_keys, n_rot_train)
        x_train_, y_train_ = ai.VoxelTestSetGenerator(train_keys, n_rot_train)
        x_test_, y_test_ = ai.VoxelTestSetGenerator(test_keys, n_rot_test)
        x_val_, y_val_ = ai.VoxelTestSetGenerator(val_keys, n_rot_test)
        val_generator = ai.VoxelTrainGenerator(val_keys, n_rot_test)

        model.fit_generator(
            train_generator,
            epochs=num_epochs,
            steps_per_epoch=train_steps_per_epoch_,
            verbose=1,
            # validation_data=(x_test_, y_test_),
            validation_data=val_generator,
            validation_steps=test_steps_per_epoch_,
            callbacks=callbacks_,
            use_multiprocessing=True)

        """
        if y_recalc is True:
            # Recalculate y: it takes a lot of time
            x_dataset_, y_dataset_ = ai.VoxelTestSetGenerator(train_keys,
                                                              n_rotation_test)
            yrecalc = model.predict(x_dataset_)
            # Store the recalculated y
            k = 0
            c = 0
            for i in range(len(yrecalc)):
                recalc[train_keys[k]].extend(list(yrecalc[i]))
                if c == n_rotation_test-1:
                    k += 1
                    c = 0
                else:
                    c += 1
        """
        """
        test_scores = model.evaluate(x_test_, y_test_)
        print("Test Scores: {}".format(test_scores))
        """

        # Reload the best checkpointed model
        model = GetLoadModelFnc()(model_outfile)
        y_recalc = model.predict(x_train_)
        ypred_test = model.predict(x_test_)
        ypred_val = model.predict(x_val_)
        # exp_pred_plot(y_test_, ypred_test[:,0])
        r2 = RSQ(y_train_, y_recalc)
        q2 = RSQ(y_test_, ypred_test)
        vr2 = RSQ(y_val_, ypred_val)
        print("Train R2: %.4f Test Q2: %.4f Val R2: %.4f\n" % (r2, q2, vr2))

        # Store the validation prediction result
        k = 0
        c = 0
        for i in range(len(ypred_val)):
            valpredictions[val_keys[k]].append(list(ypred_val[i]))
            if c == n_rot_test - 1:
                k += 1
                c = 0
            else:
                c += 1

        # Store the cross validation result
        k = 0
        c = 0
        for i in range(len(ypred_test)):
            predictions[test_keys[k]].append(list(ypred_test[i]))
            if c == n_rot_test - 1:
                k += 1
                c = 0
            else:
                c += 1

        """
        Compute the feature importance according to the
        Breiman-Fisher-Rudin-Dominici algorithm:

        Train a model f with a feature matrix X and a target vector y,
        and measure the error L(y, y_pred) = e_original.

        Input: trained model f, feature matrix X, target vector y,
               error measure L(y, y_pred)

        1) Estimate the original model error.
        2) For each feature:
           - generate a feature matrix with feature j permuted N times,
             to break the association between Xj and y;
           - estimate the error using the permuted feature matrix;
           - calculate the feature importance
             FI = e_perm/e_original or FI = e_perm - e_original.
        3) Sort the variables by descending FI.

        The error measure used here is the mean absolute error (the mean
        squared error variant, mse = (np.square(A - B)).mean(axis=0), is
        left commented out). See the standalone sketch after this function.
        """
        if feat_imp is not None:
            # e_orig = MSE(list(y_test_), list(ypred))
            e_orig = MAE(list(y_test_), list(ypred_test))
            # Calculate the feature importance for the descriptors
            for fid_ in range(ai.nfeatures):
                for it in range(feat_imp_iterations):
                    x_test_perm = ai.FeaturePermutation(x_test_, fid=fid_)
                    ypred_perm = model.predict(x_test_perm)
                    # e_perm = MSE(list(y_test_), list(ypred_perm))
                    e_perm = MAE(list(y_test_), list(ypred_perm))
                    feat_imp[fid_].append(e_perm / e_orig)
            # Calculate the feature importance for the voxel information
            for it in range(feat_imp_iterations):
                x_test_perm = ai.FeaturePermutation(x_test_, fid=9999)
                ypred_perm = model.predict(x_test_perm)
                e_perm = MAE(list(y_test_), list(ypred_perm))
                feat_imp[-1].append(e_perm / e_orig)

        if mout_path is not None:
            model.save("%s/%d.h5" % (str(mout_path.absolute()), cv_))

        # Update the cross validation id
        cv_ += 1

    if cvout is not None:
        WriteCrossValidationOutput(cvout, ai.target, predictions, None)

    if feat_imp is not None:
        fo = open("%s" % (featimp_out), "w")
        for i in range(ai.nfeatures):
            """
            fo.write("%s," % (ai.header[i]))
            for j in range(len(feat_imp[i])-1):
                fo.write("%.4f," % (feat_imp[i][j]))
            fo.write("%.4f\n" % (feat_imp[i][-1]))
            """
            a = np.array(feat_imp[i])
            min_a = a.min()
            q1 = np.percentile(a, 25)
            med_a = np.percentile(a, 50)
            q3 = np.percentile(a, 75)
            max_a = a.max()
            fo.write("%s,%.4f,%.4f,%.4f,%.4f,%.4f\n" % (
                ai.header[i], min_a, q1, med_a, q3, max_a))
        a = np.array(feat_imp[-1])
        min_a = a.min()
        q1 = np.percentile(a, 25)
        med_a = np.percentile(a, 50)
        q3 = np.percentile(a, 75)
        max_a = a.max()
        fo.write("%s,%.4f,%.4f,%.4f,%.4f,%.4f\n" % (
            "qm_voxel_charge", min_a, q1, med_a, q3, max_a))
        """
        fo.write("%s,\n" % ("qm_voxel_charge"))
        for j in range(len(feat_imp[-1])-1):
            fo.write("%.4f," % (feat_imp[-1][j]))
        fo.write("%.4f\n" % (feat_imp[-1][-1]))
        """
        fo.close()

    ycvp = {}
    for key in predictions.keys():
        if len(predictions[key]) > 0:
            ycvp[key] = np.mean(predictions[key])
        else:
            continue
    return ycvp
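
# The docstring in cv() above describes permutation feature importance
# (FI = e_perm / e_original with an MAE error measure). The helper below is
# a minimal, self-contained sketch of that scheme, NOT part of the pipeline:
# `predict` stands for any fitted model's predict callable, X is a
# (n_samples, n_features) matrix and y the target vector; all names here
# are hypothetical.
import numpy as np


def permutation_importance_sketch(predict, X, y, n_iter=20, seed=None):
    rng = np.random.default_rng(seed)
    y = np.asarray(y, dtype=float)
    # 1) Estimate the original model error (baseline MAE)
    e_orig = np.mean(np.abs(y - np.ravel(predict(X))))
    importances = []
    for fid in range(X.shape[1]):
        ratios = []
        for _ in range(n_iter):
            X_perm = np.array(X, copy=True)
            # 2) Shuffle one column to break the association between Xj and y
            rng.shuffle(X_perm[:, fid])
            e_perm = np.mean(np.abs(y - np.ravel(predict(X_perm))))
            ratios.append(e_perm / e_orig)
        importances.append(np.mean(ratios))
    # 3) np.argsort(importances)[::-1] gives the descending-FI ranking
    return np.array(importances)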
def runcv(self, batch_size_, batch_mode_, num_epochs, ndense_layers, nunits,
          cvout, n_splits=5, n_repeats=10, random_state=None, mout=None,
          fimpfile=None):
    print("N. instances: %d" % (len(self.target)))
    mout_path = None
    if mout is not None:
        # Utilised to store the out path
        # mout_path = Path("%s_%s" % (time.strftime("%Y%m%d%H%M%S"), mout))
        mout_path = Path(mout)
    else:
        # Utilised to store the out path
        mout_path = Path("%s_model" % (time.strftime("%Y%m%d%H%M%S")))

    last_model = None
    if mout_path.exists():
        # Find the last model and restart the calculation from it.
        p = Path(mout_path).glob('**/*.h5')
        # Get only the file numbers
        mids = [int(x.stem) for x in p if x.is_file()]
        if len(mids) > 0:
            # Restart from here...
            last_model = max(mids)
        else:
            last_model = None
    else:
        mout_path.mkdir()
        # Save the descriptor order
        f = open("%s/odesc_header.csv" % (str(mout_path.absolute())), "w")
        for dname in self.xheader:
            f.write("%s\n" % (dname))
        f.close()

    feat_imp = {}
    if fimpfile is not None:
        for feat_name in self.xheader:
            feat_imp[feat_name] = {'mae': [], 'mse': []}

    cv_ = 0
    predictions = {}
    recalc = {}
    for key in self.target.keys():
        predictions[key] = []
        recalc[key] = []

    valfn = GetValidationFnc()
    if valfn is None:
        # Yields (train_keys, val_keys, test_keys) triplets;
        # see the interface sketch after this method.
        valfn = RepeatedKFold(list(self.target.keys()),
                              n_splits,
                              n_repeats,
                              random_state=random_state,
                              test_size=0.2)
    else:
        print("Using custom validation split function")
        valfn = valfn(list(self.target.keys()))

    for train_keys, val_keys, test_keys in valfn:
        # Some memory clean-up
        K.clear_session()
        train_steps_per_epoch = ceil(len(train_keys) / float(batch_size_))
        train_generator = self.DataGenerator(train_keys,
                                             batch_size_,
                                             batch_mode_)
        # x_train, y_train = self.GenData(train_keys)
        # test_steps_per_epoch = ceil(len(train_keys)/float(batch_size_))
        # test_generator = self.DataGenerator(test_keys, batch_size_)
        x_test, y_test = self.GenData(test_keys)
        x_val, y_val = self.GenData(val_keys)
        print("Train set size: %d Val set size: %d Test set size: %d" % (
            len(train_keys), len(val_keys), len(test_keys)))
        model_output = "%s/%d.h5" % (str(mout_path.absolute()), cv_)
        if last_model is None:
            model = None
            model_ = GetKerasModel()
            if model_ is None:
                model = example_build_model(self.nfeatures,
                                            nunits,
                                            ndense_layers,
                                            self.ntargets)
            else:
                model = model_(self.nfeatures, nunits, ndense_layers)
            print(model.summary())
            dname = cvout.replace(".csv", "")
            b = batch_size_
            log_dir_ = ("./logs/cv%d_%s_#b%d_#e%d_#u%d_#dl%d_" % (
                cv_, dname, b, num_epochs, nunits, ndense_layers))
            log_dir_ += time.strftime("%Y%m%d%H%M%S")
            callbacks_ = [
                TensorBoard(log_dir=log_dir_,
                            histogram_freq=0,
                            write_graph=False,
                            write_images=False),
                ModelCheckpoint(model_output,
                                monitor='val_loss',
                                verbose=0,
                                save_best_only=True)
            ]
            model.fit_generator(
                train_generator,
                steps_per_epoch=train_steps_per_epoch,
                epochs=num_epochs,
                verbose=self.verbose,
                validation_data=(x_val, y_val),
                # validation_data=test_generator,
                # validation_steps=test_steps_per_epoch,
                callbacks=callbacks_)
        else:
            if last_model - 1 == cv_:
                last_model = None

        # Reload the best checkpointed model for this fold
        model_ = GetLoadModelFnc()(model_output)
        y_recalc_train = self.makePrediction(model_, train_keys)
        y_pred_val = self.makePrediction(model_, val_keys)
        y_pred_test = self.makePrediction(model_, test_keys)

        y_recalc = []
        y_true_recalc = []
        for key in train_keys:
            y_recalc.append(y_recalc_train[key])
            y_true_recalc.append(self.target[key])
            recalc[key].append(y_recalc_train[key])

        ypred_val = []
        ytrue_val = []
        for key in val_keys:
            ypred_val.append(y_pred_val[key])
            ytrue_val.append(self.target[key])

        ypred_test = []
        ytrue_test = []
        for key in test_keys:
            ypred_test.append(y_pred_test[key])
            ytrue_test.append(self.target[key])
            # Store the test set prediction
            predictions[key].append(y_pred_test[key])

        r2 = RSQ(y_true_recalc, y_recalc)
        q2 = RSQ(ytrue_test, ypred_test)
        tr2 = RSQ(ytrue_val, ypred_val)
        print("Train R2: %.4f Test Q2: %.4f Val R2: %.4f\n" % (r2, q2, tr2))

        # Store the cross validation model
        # if mout_path is not None:
        #     model.save("%s/%d.h5" % (str(mout_path.absolute()), cv_))

        if fimpfile is not None:
            fimp = FeatureImportance(model_, x_test, y_test, self.xheader)
            fires = fimp.Calculate(verbose=1)
            for key in fires.keys():
                feat_imp[key]['mae'].extend(fires[key]['mae'])
                feat_imp[key]['mse'].extend(fires[key]['mse'])
        cv_ += 1

    WriteCrossValidationOutput(cvout, self.target, predictions, recalc)
    if fimpfile is not None:
        WriteFeatureImportance(feat_imp, fimpfile)
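
# Both cv() and the runcv() methods above iterate over
# (train_keys, val_keys, test_keys) triplets yielded by RepeatedKFold or a
# custom validation function. The generator below is a minimal sketch of
# that assumed interface only, built on scikit-learn's KFold and
# train_test_split; it is not the project's RepeatedKFold implementation.
from sklearn.model_selection import KFold, train_test_split


def repeated_kfold_sketch(keys, n_splits, n_repeats,
                          random_state=None, test_size=0.2):
    keys = list(keys)
    for rep in range(n_repeats):
        seed = None if random_state is None else random_state + rep
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        for train_idx, test_idx in kf.split(keys):
            trainval = [keys[i] for i in train_idx]
            test_keys = [keys[i] for i in test_idx]
            # Carve a validation subset out of the training keys
            train_keys, val_keys = train_test_split(trainval,
                                                    test_size=test_size,
                                                    random_state=seed)
            yield train_keys, val_keys, test_keys

# Usage:
# for train_keys, val_keys, test_keys in repeated_kfold_sketch(keys, 5, 10, 0):
#     ...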
def simplerun(db, csv_target, csv_descriptors, num_epochs, n_rot_train,
              train_steps_per_epoch_, n_rotation_test, test_steps_per_epoch_,
              ndense_layers, nunits, nfilters, random_state, outmodel=None,
              fcvgroup=None, tid=None):
    # Load the dataset
    ai = AIModel(csv_target, db, csv_descriptors)
    available_keys = ai.GetAvailableKeys()
    train_keys = None
    test_keys = None
    if fcvgroup is not None:
        cvgroups = CVGroupRead(fcvgroup)
        tkey = None
        if tid is not None:
            tkey = int(tid)
            print(tkey)
        else:
            tkey = random.choice(list(cvgroups.keys()))
        print(cvgroups[tkey])
        test_keys = cvgroups[tkey]
        train_keys = []
        for key in cvgroups.keys():
            if key == tkey:
                continue
            else:
                train_keys.extend(cvgroups[key])
    else:
        ttfn = GetTrainTestFnc()
        if ttfn is None:
            ttfn = TrainTestSplit
        else:
            print("Using custom train/test split function")
        train_keys, test_keys = ttfn(available_keys,
                                     test_size=0.20,
                                     random_state=random_state)

    print("Training set size: %d Test set size: %d" % (
        len(train_keys), len(test_keys)))
    train_generator = ai.VoxelTrainGenerator(train_keys, n_rot_train)
    print(train_keys)
    print(test_keys)

    model = None
    model_ = GetKerasModel()
    if ai.other_descriptors is None:
        if model_ is None:
            model = build_model(ai.conv3d_chtype,
                                ai.input_shape,
                                ndense_layers,
                                nunits,
                                nfilters)
        else:
            model = model_(ai.conv3d_chtype,
                           ai.input_shape,
                           ndense_layers,
                           nunits,
                           nfilters)
        # model = build_fcn_model(ai.conv3d_chtype, ai.input_shape, ndense_layers, nunits, nfilters)
        # model = model_scirep(ai.conv3d_chtype, ai.input_shape, ndense_layers, nunits, nfilters)
        # model = ResNetModel(ai.input_shape)
        print(model.summary())
    else:
        if model_ is None:
            model = build_2DData_model(ai.conv3d_chtype,
                                       ai.input_shape,
                                       ai.nfeatures,
                                       ndense_layers,
                                       nunits,
                                       nfilters)
        else:
            model = model_(ai.conv3d_chtype,
                           ai.input_shape,
                           ai.nfeatures,
                           ndense_layers,
                           nunits,
                           nfilters)
        """
        for l in model.layers[0].layers:
            print(l.summary())
        """
        print("Total Summary")
        print(model.summary())

    plot_model(model, to_file="model.png", show_shapes=True)

    dname = os.path.basename(csv_target).replace(".csv", "")
    dname += os.path.basename(db)
    log_dir_ = ("./logs/%s_%d_#rot%d_#f%d_#dl%d_#u%d_" % (
        dname, num_epochs, train_steps_per_epoch_, nfilters,
        ndense_layers, nunits))
    log_dir_ += time.strftime("%Y%m%d%H%M%S")
    callbacks_ = [
        TensorBoard(log_dir=log_dir_,
                    histogram_freq=0,
                    write_graph=False,
                    write_images=False)
    ]
    """
    , EarlyStopping(monitor='val_loss',
                    min_delta=0,
                    patience=3,
                    verbose=0,
                    mode='auto')
    """
    test_generator = ai.VoxelTrainGenerator(test_keys, n_rot_train)
    model.fit_generator(
        train_generator,
        epochs=num_epochs,
        steps_per_epoch=train_steps_per_epoch_,
        verbose=1,
        # max_queue_size=2,
        # workers=0,
        # validation_data=(x_test_, y_test_),
        validation_data=test_generator,
        validation_steps=test_steps_per_epoch_,
        # nb_val_samples=x_test.shape[0],
        callbacks=callbacks_,
        use_multiprocessing=True)

    x_test_, y_test_ = ai.VoxelTestSetGenerator(test_keys, n_rotation_test)
    y_pred_ = model.predict(x_test_)
    print("Test R2: %.4f" % (r2_score(y_test_, y_pred_)))

    fo = open("statconf.csv", "w")
    for key in ai.statvoxconf.keys():
        fo.write("%s," % (key))
        for i in range(len(ai.statvoxconf[key])):
            for j in range(len(ai.statvoxconf[key][i])):
                fo.write("%d," % (ai.statvoxconf[key][i][j]))
        fo.write("\n")
    fo.close()
    # score = model.evaluate(x_test_, y_test_, verbose=0)
    # print(score)
    if outmodel is not None:
        model.save(outmodel)
def simplerun(self, batch_size_, batch_mode_, num_epochs, ndense_layers,
              nunits, random_state, model_output=None):
    """
    Run a simple model...
    """
    # train_keys, test_keys = MDCTrainTestSplit(self.target, 0)
    # train_keys, test_keys = DISCTrainTestSplit(self.target)
    ttfn = GetTrainTestFnc()
    if ttfn is None:
        ttfn = TrainTestSplit
    else:
        print("Using custom train/test split function")
    train_keys, test_keys = ttfn(list(self.target.keys()),
                                 test_size=0.20,
                                 random_state=random_state)
    print("Train set size: %d Test set size: %d" % (
        len(train_keys), len(test_keys)))

    model = None
    if model_output is not None and Path(model_output).is_file():
        model = GetLoadModelFnc()(model_output)
    else:
        model_ = GetKerasModel()
        if model_ is None:
            model = example_build_model(self.nfeatures,
                                        nunits,
                                        ndense_layers,
                                        self.ntargets)
        else:
            model = model_(self.nfeatures, nunits, ndense_layers)
    print(model.summary())

    train_steps_per_epoch = ceil(len(train_keys) / float(batch_size_))
    train_generator = self.DataGenerator(train_keys, batch_size_, batch_mode_)
    # x_train, y_train = self.GenData(train_keys)  # This is unstable
    # test_steps_per_epoch = ceil(len(train_keys)/float(batch_size_))
    # test_generator = self.DataGenerator(test_keys, batch_size_)
    # This is more stable
    x_test, y_test = self.GenData(test_keys)

    b = batch_size_
    log_dir_ = ("./logs/#b%d_#e%d_#u%d_#dl%d_" % (
        b, num_epochs, nunits, ndense_layers))
    log_dir_ += time.strftime("%Y%m%d%H%M%S")
    callbacks_ = [
        TensorBoard(log_dir=log_dir_,
                    histogram_freq=0,
                    write_graph=False,
                    write_images=False)
    ]
    """
    model.fit(x_train, y_train,
              epochs=num_epochs,
              batch_size=b,
              verbose=self.verbose,
              validation_data=(x_test, y_test),
              callbacks=callbacks_)
    yrecalc_train = model.predict(x_train)
    """
    model.fit_generator(
        train_generator,
        steps_per_epoch=train_steps_per_epoch,
        epochs=num_epochs,
        verbose=1,
        validation_data=(x_test, y_test),
        # validation_data=test_generator,
        # validation_steps=test_steps_per_epoch,
        callbacks=callbacks_)

    y_recalc_train = self.makePrediction(model, train_keys)
    y_pred_test = self.makePrediction(model, test_keys)

    ytrain_recalc = []
    ytrain_true = []
    for key in train_keys:
        ytrain_recalc.append(y_recalc_train[key])
        ytrain_true.append(self.target[key])

    ytest_pred = []
    ytest_true = []
    for key in test_keys:
        ytest_pred.append(y_pred_test[key])
        ytest_true.append(self.target[key])

    print("R2: %.4f Q2: %.4f MSE: %.4f" % (RSQ(ytrain_true, ytrain_recalc),
                                           RSQ(ytest_true, ytest_pred),
                                           MSE(ytest_true, ytest_pred)))

    fo = open("%s_pred.csv" % time.strftime("%Y%m%d%H%M%S"), "w")
    for i in range(len(ytest_true)):
        fo.write("%s" % (test_keys[i]))
        for j in range(len(ytest_true[i])):
            fo.write(",%f,%f" % (ytest_true[i][j], ytest_pred[i][j]))
        fo.write("\n")
    fo.close()

    if model_output is not None:
        model.save(model_output)
def GridSearch(self, batch_size_, steps_per_epoch_, num_epochs, random_state,
               gmout="GridSearchResult"):
    train_keys, test_keys = TrainTestSplit(list(self.target.keys()),
                                           test_size=0.20,
                                           random_state=random_state)
    print("Train set size: %d Test set size: %d" % (
        len(train_keys), len(test_keys)))
    # train_steps_per_epoch = ceil(len(train_keys)/float(batch_size_))
    # train_generator = self.DataGenerator(train_keys, batch_size_)
    x_train, y_train, rtrain_keys = self.GenData(train_keys)  # This is unstable
    # test_steps_per_epoch = ceil(len(train_keys)/float(batch_size_))
    # test_generator = self.DataGenerator(test_keys, batch_size_)
    # This is more stable
    x_test, y_test, rtest_keys = self.GenData(test_keys)

    # PARAMETER DEFINITIONS
    # simple architecture
    """
    param = {}
    param["nunits"] = [100, 200, 400]
    param["ndense_layers"] = [2, 4, 6]
    param["dropout"] = ["on", "off"]
    # param["activation"] = ["relu", "leakyrelu"]
    param["activation"] = ["relu"]
    """
    # resnet architecture
    param = {}
    param["nunits"] = [200, 400, 600, 800]
    param["ndense_layers"] = [2, 4, 6, 8]
    all_combo = list(ParameterGrid(param))
    print("Evaluating %d combinations of parameters" % (len(all_combo)))

    already_computed_combo = []
    if Path(gmout).is_file():
        fi = open(gmout, "r")
        for line in fi:
            v = str.split(line.strip(), " ")
            """
            # simple architecture
            units = v[0]
            layers = v[1]
            act = v[2]
            drop = v[3]
            s = ("%s-%s-%s-%s" % (units, layers, act, drop))
            """
            # resnet architecture
            units = v[0]
            layers = v[1]
            s = ("%s-%s" % (units, layers))
            already_computed_combo.append(s)
        fi.close()

    model_ = GetKerasModel()
    for c in all_combo:
        """
        # simple architecture
        s = ("%s-%s-%s-%s" % (c["nunits"],
                              c["ndense_layers"],
                              c["activation"],
                              c["dropout"]))
        """
        # resnet architecture
        s = ("%s-%s" % (c["nunits"], c["ndense_layers"]))
        if s in already_computed_combo:
            print("%s already computed... skip..." % (s))
        else:
            """
            model = build_gridsearch_model(self.nfeatures,
                                           c["ndense_layers"],
                                           c["nunits"],
                                           c["activation"],
                                           c["dropout"])
            """
            if model_ is None:
                model = example_build_model(self.nfeatures,
                                            c["nunits"],
                                            c["ndense_layers"])
            else:
                model = model_(self.nfeatures,
                               c["nunits"],
                               c["ndense_layers"])
            """
            model = build_dnn_resnet_model(self.nfeatures,
                                           c["nunits"],
                                           c["ndense_layers"])
            """
            print(model.summary())
            b = batch_size_
            """
            model_name = ("#b%d_#e%d_#u%d_#dl%d_act-%s_dp-%s" % (
                b, num_epochs, c["nunits"], c["ndense_layers"],
                c["activation"], c["dropout"]))
            """
            model_name = ("#b%d_#e%d_#u%d_#dl%d" % (
                b, num_epochs, c["nunits"], c["ndense_layers"]))
            log_dir_ = ("./logs/%s" % (model_name))
            log_dir_ += time.strftime("%Y%m%d%H%M%S")
            model_output = "%s.h5" % (model_name)
            callbacks_ = [TensorBoard(log_dir=log_dir_,
                                      histogram_freq=0,
                                      write_graph=False,
                                      write_images=False),
                          ModelCheckpoint(model_output,
                                          monitor='val_loss',
                                          verbose=0,
                                          save_best_only=True)]
            """
            callbacks_ = [TensorBoard(log_dir=log_dir_,
                                      histogram_freq=0,
                                      write_graph=False,
                                      write_images=False),
                          EarlyStopping(monitor='val_loss',
                                        min_delta=0,
                                        patience=50,
                                        verbose=0,
                                        mode='auto')]
            """
            model.fit(x_train, y_train,
                      epochs=num_epochs,
                      batch_size=b,
                      steps_per_epoch=steps_per_epoch_,
                      verbose=self.verbose,
                      validation_data=(x_test, y_test),
                      callbacks=callbacks_)
            bestmodel = load_model(model_output,
                                   custom_objects={"score": score})
            yrecalc_train = bestmodel.predict(x_train)
            """
            model.fit_generator(train_generator,
                                steps_per_epoch=train_steps_per_epoch,
                                epochs=num_epochs,
                                verbose=1,
                                validation_data=(x_test, y_test),
                                # validation_data=test_generator,
                                # validation_steps=test_steps_per_epoch,
                                callbacks=callbacks_)
            yrecalc_train = []
            y_train = []
            for key in train_keys:
                a = np.array([self.X_raw[key]])
                yrecalc_train.extend(model.predict(a))
                y_train.append(self.target[key])
            """
            ypred_test = bestmodel.predict(x_test)
            r2 = r2_score(y_train, yrecalc_train)
            mse_train = mse(y_train, yrecalc_train)
            mae_train = mae(y_train, yrecalc_train)
            q2 = r2_score(y_test, ypred_test)
            mse_test = mse(y_test, ypred_test)
            mae_test = mae(y_test, ypred_test)
            train_score = LOGMAE(y_train, yrecalc_train)
            test_score = LOGMAE(y_test, ypred_test)
            print("R2: %.4f Train Score: %f Q2: %.4f Test Score: %f" % (
                r2, train_score, q2, test_score))
            fo = open("%s" % (gmout), "a")
            """
            # simple architecture
            fo.write("%d %d %s %s %f %f %f %f %f %f %f %f\n" % (
                c["nunits"], c["ndense_layers"], c["activation"],
                c["dropout"], mse_train, mae_train, r2, train_score,
                mse_test, mae_test, q2, test_score))
            """
            # resnet architecture
            fo.write("%d %d %f %f %f %f %f %f %f %f\n" % (
                c["nunits"], c["ndense_layers"], mse_train, mae_train,
                r2, train_score, mse_test, mae_test, q2, test_score))
            fo.close()
def runloo(self, batch_size_, num_epochs, ndense_layers, nunits, cvout):
    print("N. instances: %d" % (len(self.target)))
    predictions = dict()
    for val_key in self.target.keys():
        sub_target = {}
        for key in self.target.keys():
            if val_key == key:
                continue
            else:
                sub_target[key] = self.target[key]
                # train_keys.append(key)
        print("Validating %s" % (val_key))
        # train_keys, test_keys = MDCTrainTestSplit(sub_target, 0)
        train_keys, test_keys = TrainTestSplit(sub_target, test_size=0.20)
        x_train, y_train, rtrain_keys = self.GenData(train_keys)
        x_test, y_test, rtest_keys = self.GenData(test_keys)

        model = None
        model_ = GetKerasModel()
        if model_ is None:
            model = example_build_model(self.nfeatures, nunits, ndense_layers)
        else:
            model = model_(self.nfeatures, nunits, ndense_layers)
        print(model.summary())

        b = 0
        if batch_size_ is None:
            b = len(x_test)
        else:
            b = batch_size_
        log_dir_ = ("./logs/%s_#b%d_#e%d_#u%d_#dl%d_" % (
            val_key, b, num_epochs, nunits, ndense_layers))
        log_dir_ += time.strftime("%Y%m%d%H%M%S")
        callbacks_ = [TensorBoard(log_dir=log_dir_,
                                  histogram_freq=0,
                                  write_graph=False,
                                  write_images=False)]
        """
        callbacks_ = [TensorBoard(log_dir=log_dir_,
                                  histogram_freq=0,
                                  write_graph=False,
                                  write_images=False),
                      EarlyStopping(monitor='val_loss',
                                    min_delta=0,
                                    patience=3,
                                    verbose=0,
                                    mode='auto')]
        """
        model.fit(x_train, y_train,
                  epochs=num_epochs,
                  batch_size=b,
                  verbose=1,
                  validation_data=(x_test, y_test),
                  callbacks=callbacks_)
        # Predict the left-out instance
        x_val, y_val, rval_keys = self.GenData([val_key])
        predictions[val_key] = model.predict(x_val)[0]

    fo = open(cvout, "w")
    for key in predictions.keys():
        fo.write("%s,%.4f,%.4f\n" % (key, self.target[key], predictions[key]))
    fo.close()
def runcv(self, batch_size_, num_epochs, steps_per_epoch_, nfilters, nunits,
          random_state, cvout, n_splits=5, n_repeats=10, mout=None):
    print("N. instances: %d" % (len(self.target)))
    mout_path = None
    if mout is not None:
        # Utilised to store the out path
        mout_path = Path("%s_%s" % (time.strftime("%Y%m%d%H%M%S"), mout))
        mout_path.mkdir()
        # Save the descriptor order
        """
        f = open("%s/odesc_header.csv" % (str(mout_path.absolute())), "w")
        for name in self.xheader:
            f.write("%s\n" % (name))
        f.close()
        """

    cv_ = 0
    predictions = {}
    recalc = {}
    for key in self.target.keys():
        # N.B.: each molecule can have multiple outputs.
        predictions[key] = []
        recalc[key] = []

    for train_keys, val_keys, test_keys in RepeatedKFold(list(self.target.keys()),
                                                         n_splits,
                                                         n_repeats,
                                                         random_state=random_state,
                                                         test_size=0.2):
        print("Train set size: %d Val set size: %d Test set size: %d" % (
            len(train_keys), len(val_keys), len(test_keys)))
        x_train, y_train, rtrain_keys = self.GenData(train_keys)
        x_val, y_val, rval_keys = self.GenData(val_keys)
        x_test, y_test, rtest_keys = self.GenData(test_keys)

        # Some memory clean-up
        K.clear_session()

        model = None
        model_ = GetKerasModel()
        if self.dx is not None:
            print("Number of descriptors: %d" % (self.n_descs))
            if model_ is None:
                model = example_build_2DData_model(self.db.input_shape,
                                                   self.n_descs,
                                                   nfilters,
                                                   nunits)
            else:
                model = model_(self.db.input_shape,
                               self.n_descs,
                               nfilters,
                               nunits)
        else:
            if model_ is None:
                model = example_build_model(self.db.input_shape,
                                            nfilters,
                                            nunits)
            else:
                model = model_(self.db.input_shape, nfilters, nunits)
        print(model.summary())

        dname = cvout.replace(".csv", "")
        b = 0
        if batch_size_ is None:
            b = len(x_val)
        else:
            b = batch_size_
        name = "cv%d_%s_#b%d_#e%d_#u%d_#f%d_" % (
            cv_, dname, b, num_epochs, nunits, nfilters)
        name += time.strftime("%Y%m%d%H%M%S")
        log_dir_ = ("./logs/%s" % (name))

        model_output = None
        if mout_path is not None:
            model_output = "%s/%d.h5" % (str(mout_path.absolute()), cv_)

        if model_output is None:
            callbacks_ = [TensorBoard(log_dir=log_dir_,
                                      histogram_freq=0,
                                      write_graph=False,
                                      write_images=False)]
        else:
            callbacks_ = [TensorBoard(log_dir=log_dir_,
                                      histogram_freq=0,
                                      write_graph=False,
                                      write_images=False),
                          ModelCheckpoint(model_output,
                                          monitor='val_loss',
                                          verbose=0,
                                          save_best_only=True)]

        train_generator = self.DataGenerator(train_keys, batch_size_)
        model.fit_generator(train_generator,
                            steps_per_epoch=steps_per_epoch_,
                            epochs=num_epochs,
                            verbose=1,
                            validation_data=(x_val, y_val),
                            # validation_data=test_generator,
                            # validation_steps=test_steps_per_epoch,
                            callbacks=callbacks_,
                            use_multiprocessing=True)
        """
        model.fit(x_train, y_train,
                  epochs=num_epochs,
                  batch_size=b,
                  steps_per_epoch=steps_per_epoch_,
                  verbose=1,
                  validation_data=(x_val, y_val),
                  callbacks=callbacks_)
        """
        # WARNING: implement cross validation results for multiple outputs
        if model_output is not None:
            bestmodel = load_model(model_output,
                                   custom_objects={"score": score})
        else:
            bestmodel = model

        yrecalc = bestmodel.predict(x_train)
        for i in range(len(yrecalc)):
            recalc[rtrain_keys[i]].append(list(yrecalc[i]))

        ypred_val = bestmodel.predict(x_val)
        print("Validation R2: %.4f" % (r2_score(y_val, ypred_val)))

        ypred_test = bestmodel.predict(x_test)
        # exp_pred_plot(y_val_, ypred[:,0])
        print("Test R2: %.4f" % (r2_score(y_test, ypred_test)))
        for i in range(len(ypred_test)):
            predictions[rtest_keys[i]].append(list(ypred_test[i]))

        """
        if fimpfile is not None:
            fimp = FeatureImportance(model, x_val, y_val, self.xheader)
            fires = fimp.Calculate(verbose=1)
            for key in fires.keys():
                feat_imp[key]['mae'].extend(fires[key]['mae'])
                feat_imp[key]['mse'].extend(fires[key]['mse'])
        """
        cv_ += 1

    WriteCrossValidationOutput(cvout, self.target, predictions, recalc)
def simplerun(self, batch_size_, num_epochs, steps_per_epoch_, nfilters,
              nunits, random_state, mout=None):
    print("N. instances: %d" % (len(self.target)))
    ttfn = GetTrainTestFnc()
    if ttfn is None:
        ttfn = TrainTestSplit
    else:
        print("Using custom train/test split function")
    train_keys, test_keys = ttfn(list(self.target.keys()),
                                 test_size=0.20,
                                 random_state=random_state)
    print("Train set size: %d Test set size: %d" % (
        len(train_keys), len(test_keys)))

    model = None
    model_ = GetKerasModel()
    if self.dx is not None:
        print("Number of descriptors: %d" % (self.n_descs))
        if model_ is None:
            model = example_build_2DData_model(self.db.input_shape,
                                               self.n_descs,
                                               nfilters,
                                               nunits)
        else:
            model = model_(self.db.input_shape,
                           self.n_descs,
                           nfilters,
                           nunits)
    else:
        if model_ is None:
            model = example_build_model(self.db.input_shape, nfilters, nunits)
        else:
            model = model_(self.db.input_shape, nfilters, nunits)
    print(model.summary())

    x_train, y_train, rtrain_keys = self.GenData(train_keys)
    x_test, y_test, rtest_keys = self.GenData(test_keys)
    if self.dx is not None:
        print("Branch 1 size:", np.array(x_train[0]).shape)
        print("Branch 2 size:", np.array(x_train[1]).shape)
    else:
        print(x_train.shape)
        print(y_train.shape)

    b = 0
    if batch_size_ is None:
        b = len(x_test)
    else:
        b = batch_size_
    name = "#b%d_#e%d_#u%d_#f%d_" % (b, num_epochs, nunits, nfilters)
    name += time.strftime("%Y%m%d%H%M%S")
    log_dir_ = ("./logs/%s" % (name))
    callbacks_ = [TensorBoard(log_dir=log_dir_,
                              histogram_freq=0,
                              write_graph=False,
                              write_images=False)]
    """
    callbacks_ = [TensorBoard(log_dir=log_dir_,
                              histogram_freq=0,
                              write_graph=False,
                              write_images=False),
                  EarlyStopping(monitor='val_loss',
                                min_delta=0,
                                patience=50,
                                verbose=0,
                                mode='auto')]
    """
    # train_steps_per_epoch = int(np.ceil(len(train_keys)/float(batch_size_)))
    train_generator = self.DataGenerator(train_keys, batch_size_)
    model.fit_generator(train_generator,
                        steps_per_epoch=steps_per_epoch_,
                        epochs=num_epochs,
                        verbose=1,
                        validation_data=(x_test, y_test),
                        # validation_data=test_generator,
                        # validation_steps=test_steps_per_epoch,
                        callbacks=callbacks_,
                        use_multiprocessing=True)
    """
    model.fit(x_train, y_train,
              epochs=num_epochs,
              batch_size=b,
              verbose=self.verbose,
              validation_data=(x_test, y_test),
              callbacks=callbacks_)
    """
    yrecalc = model.predict(x_train)
    ypred_test = model.predict(x_test)

    fo = open("%s_pred.csv" % (name), "w")
    if ypred_test.shape[1] > 1:
        for i in range(len(rtest_keys)):
            fo.write("%s," % (rtest_keys[i]))
            for j in range(len(y_test[i]) - 1):
                fo.write("%f,%f," % (y_test[i][j], ypred_test[i][j]))
            fo.write("%f,%f\n" % (y_test[i][-1], ypred_test[i][-1]))
        fo.close()
        # Then calculate R2 and Q2 for each output...
        for j in range(ypred_test.shape[1]):
            y_train_ = [row[j] for row in y_train]
            yrecalc_ = [row[j] for row in yrecalc]
            y_test_ = [row[j] for row in y_test]
            ypred_test_ = [row[j] for row in ypred_test]
            print("Output %d R2: %.4f Q2: %.4f" % (
                j, r2_score(y_train_, yrecalc_),
                r2_score(y_test_, ypred_test_)))
    else:
        for i in range(len(rtest_keys)):
            fo.write("%s," % (rtest_keys[i]))
            for j in range(len(y_test[i])):
                fo.write("%f,%f" % (y_test[i][j], ypred_test[i][j]))
            fo.write("\n")
        fo.close()
        print("R2: %.4f Q2: %.4f" % (r2_score(y_train, yrecalc),
                                     r2_score(y_test, ypred_test)))