def do_kfolds(x, y):
    """Run stratified 10-fold cross-validation and print aggregate metrics.

    For every fold the features are standardised with the mean and standard
    deviation of that fold's training part, a fresh model is built and
    fitted, and the fold's test part is classified and scored with
    ``get_performance_vals``.

    Args:
        x: Feature matrix (sequence of numeric feature vectors).
        y: Class labels aligned with ``x``.

    Returns:
        A tuple ``(trained_M, mean, std)`` holding the model, the feature
        means and the (epsilon-padded) feature standard deviations of the
        LAST fold only.
    """
    random.seed(seed)
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

    accuracies = []
    fscores = []
    precisions = []
    recalls = []
    mccs = []

    # Force a float matrix: with integer-valued features an in-place
    # ``x -= mean`` cannot cast the float result back and raises.
    x = np.array(x, dtype=float)
    y = np.array(y)
    print("Preparing training and label data...OK")
    print("")

    eps = 10**-5  # keeps the division finite for constant features
    for train, test in kfold.split(x, y):
        x_train, y_train = x[train], y[train]
        x_test, y_test = x[test], y[test]

        # Statistics come from the training part only, so nothing about the
        # test part leaks into the scaling.
        mean = np.mean(x_train, axis=0)
        std = np.std(x_train, axis=0) + eps
        x_train = (x_train - mean) / std
        x_test = (x_test - mean) / std

        print("Training model on data...")
        s_training = time.time()
        M = trainer.build_sequential_model(rate=0.3, shape=x_train.shape[1])
        trained_M = trainer.fit_model_batch(M, x_train, y_train,
                                            num_epoch=2000)
        e_training = time.time()
        print("Training model on data...OK, took: " +
              str((e_training - s_training)))

        print("Classifying data...")
        s_classify = time.time()
        classes = np.array(trained_M.predict_classes(x_test)).ravel()
        e_classify = time.time()
        print("Classifying data...OK, took: " +
              str((e_classify - s_classify)))

        mcc, accuracy, fscore, precision, recall = get_performance_vals(
            y_test, classes)
        mccs.append(mcc)
        accuracies.append(accuracy)
        fscores.append(fscore)
        precisions.append(precision)
        recalls.append(recall)

    print("MCC: %.2f (+/- %.2f)" % (np.mean(mccs), np.std(mccs)))
    print("Accuracy: %.2f%% (+/- %.2f%%)" %
          (100 * np.mean(accuracies), 100 * np.std(accuracies)))
    print("F1 score: %.2f (+/- %.2f)" % (np.mean(fscores), np.std(fscores)))
    print("Precision: %.2f (+/- %.2f)" %
          (np.mean(precisions), np.std(precisions)))
    print("Recall: %.2f (+/- %.2f)" % (np.mean(recalls), np.std(recalls)))
    return trained_M, mean, std
def _descriptors_to_matrix(descriptor_vectors):
    """Convert descriptor vectors to numeric vectors, skipping ``None``."""
    return [
        FX.num_vector_from_descriptor_vector(dvec)
        for dvec in descriptor_vectors
        if dvec is not None
    ]


def _train_and_evaluate_holdout(x_train, y_train, x_test, y_test):
    """Standardise, fit one model and score it on a held-out set.

    Returns ``(trained_M, mean, std)`` where ``mean``/``std`` are the
    training-set statistics used for the scaling (``std`` epsilon-padded).
    """
    # Float copies: avoids in-place casting errors on integer features and
    # accepts plain lists as well as arrays.
    x_train = np.asarray(x_train, dtype=float)
    x_test = np.asarray(x_test, dtype=float)

    mean = np.mean(x_train, axis=0)
    std = np.std(x_train, axis=0) + 10**-5
    x_train = (x_train - mean) / std
    x_test = (x_test - mean) / std

    print("Training model on data...")
    s_training = time.time()
    M = trainer.build_sequential_model(rate=0.3, shape=x_train.shape[1])
    trained_M = trainer.fit_model_batch(M, x_train, y_train, num_epoch=2000)
    e_training = time.time()
    print("Training model on data...OK, took: " +
          str((e_training - s_training)))

    print("Classifying data...")
    s_classify = time.time()
    classes = np.array(trained_M.predict_classes(x_test)).ravel()
    e_classify = time.time()
    print("Classifying data...OK, took: " + str((e_classify - s_classify)))

    get_performance_vals(y_test, classes)
    return trained_M, mean, std


def main(model=None):
    """Run the pipeline selected by the module-level ``choice`` setting.

    Modes:
        0: stratified 10-fold cross-validation (``do_kfolds``); with
           ``old_dataset == 1`` rows of unexpected length are dropped first.
        1: train on the whole training set, evaluate on the separate test set.
        2: dropout-rate search (``do_crossval``).
        3: single stratified 80/20 train/test split.
        4: merge the test set into the training set, then ``do_kfolds``.
        5: merge the test set into the training set, then ``do_crossval``.
        6: load a previously saved model plus scaling statistics and
           classify the prediction data set.

    For every mode except 6 the scaling statistics, the model architecture
    (JSON) and the weights (HDF5) are written to the current directory.

    Args:
        model: Unused; kept so existing callers keep working.

    Raises:
        ValueError: If ``choice`` is not one of the modes above.
    """
    if choice not in (0, 1, 2, 3, 4, 5, 6):
        # Previously an unknown mode only surfaced as a NameError when the
        # (never trained) model was about to be saved.
        raise ValueError("Unsupported choice: %r" % (choice,))

    predicting = choice == 6
    has_test_set = choice in (1, 4, 5)

    if predicting:
        if already_extracted == 0:
            extract_descriptors_from_file_to_pickle(predict_pos_input_name,
                                                    predict_pos_name)
            extract_descriptors_from_file_to_pickle(predict_neg_input_name,
                                                    predict_neg_name)
        pos_dvec = IO.deserialize_descriptor_vector(predict_pos_name)
        neg_dvec = IO.deserialize_descriptor_vector(predict_neg_name)
    else:
        if already_extracted == 0:
            pos_samples = extract_descriptors_from_file_to_pickle(
                pos_input_name, pos_name)
            if use_random_small_sequence_negative == 0:
                extract_descriptors_from_file_to_pickle(
                    neg_input_name, neg_name, pos_samples)
            if has_test_set:
                extract_descriptors_from_file_to_pickle("Insert_name", postest)
                extract_descriptors_from_file_to_pickle("insert_name", negtest)

        print("Deserializing descriptor vectors...")
        pos_dvec = IO.deserialize_descriptor_vector(pos_name)
        if use_random_small_sequence_negative != 0:
            # same as neg_cytotoxic
            neg_dvec = IO.deserialize_descriptor_vector(
                "neg_pipeline_complete_anticancer")
            if len(neg_dvec) >= len(pos_dvec):
                # Truncate so both classes have the same number of samples.
                neg_dvec = neg_dvec[:len(pos_dvec)]
            else:
                print(
                    "Set use_random_small_sequence_negative to zero, because that pickle file does not contain enough samples to maintain alanced classes! Use CTRL-C to quit!"
                )
                input()
        else:
            neg_dvec = IO.deserialize_descriptor_vector(neg_name)
            if len(neg_dvec) != len(pos_dvec):
                print(
                    "Warning! Class balance is no achieved! Increase negative dataset sampling! Use CTRL-C to quit!"
                )
                print("Negative dataset length: %d" % (len(neg_dvec)))
                print("Positive dataset length: %d" % (len(pos_dvec)))
                input()
        if has_test_set:
            pos_dvec_test = IO.deserialize_descriptor_vector(postest)
            neg_dvec_test = IO.deserialize_descriptor_vector(negtest)
        print("Deserializing descriptor vectors...OK")
        print("")

    print("Extracting numerical vectors...")
    # To train on a subset of features only, filter the keys of every
    # descriptor dict in pos_dvec / neg_dvec here before conversion.
    pos_nmat = _descriptors_to_matrix(pos_dvec)
    neg_nmat = _descriptors_to_matrix(neg_dvec)
    if has_test_set:
        # Covers choice 5 as well: it uses the test matrices below, but the
        # original condition (1 or 4) never built them for that mode.
        pos_nmat_test = _descriptors_to_matrix(pos_dvec_test)
        neg_nmat_test = _descriptors_to_matrix(neg_dvec_test)
    print("Extracting numerical vectors...OK")
    print("")

    print("Preparing training and label data...")
    # Negatives (label 0) first, positives (label 1) second; shuffling is
    # left to the split / k-fold routines.
    x = neg_nmat + pos_nmat
    y = [0] * len(neg_nmat) + [1] * len(pos_nmat)
    if has_test_set:
        x_test = neg_nmat_test + pos_nmat_test
        y_test = [0] * len(neg_nmat_test) + [1] * len(pos_nmat_test)
    if choice == 4 or choice == 5:
        x.extend(x_test)
        y.extend(y_test)
        if choice == 4:
            trained_M, mean, std = do_kfolds(x, y)
        if choice == 5:
            trained_M, mean, std = do_crossval(x, y)
    print("Preparing training and label data...OK")
    print("")

    if predicting:
        with open('./' + model_name + '.json', 'r') as json_file:
            loaded_model_json = json_file.read()
        loaded_model = model_from_json(loaded_model_json)
        # load weights into new model
        loaded_model.load_weights("./" + model_name + ".h5")
        print("Loaded model from disk")
        optim = Adam(lr=0.01, beta_1=0.95)
        loaded_model.compile(loss='binary_crossentropy',
                             optimizer=optim,
                             metrics=['accuracy'])

        # NOTE: pickle.load executes arbitrary code from the file; only load
        # statistics files that were written by this script.
        with open("./" + std_name + ".pickle", 'rb') as array_file:
            std = pickle.load(array_file)
        with open("./" + mean_name + ".pickle", 'rb') as array_file:
            mean = pickle.load(array_file)

        # Apply the scaling that was stored at training time.
        x = (np.asarray(x, dtype=float) - mean) / std
        result = loaded_model.predict(x)
        print("Probabilities:")
        print(result)
        classes = np.array(loaded_model.predict_classes(x)).ravel()
        print("Calsses:")
        print(classes)
        if known_classes == 1:
            get_performance_vals(y, classes)

    # 10-folds cross-validation
    if choice == 0:
        if old_dataset == 1:
            no_features = 114  # MODIFY THIS ACCORDINGLY
            # Filter while the rows are still a plain list: building an
            # array from rows of unequal length fails on recent NumPy.
            rows = [np.array(xi).T for xi in x]
            keep = [
                i for i, row in enumerate(rows)
                if row.shape[0] == no_features
            ]
            print(len(rows) - len(keep))  # number of discarded rows
            x = np.array([rows[i] for i in keep])
            y = np.array([y[i] for i in keep])
            print(x.size)
            # reshape returns a new array; the original discarded it.
            x = x.reshape(x.shape[0], no_features)
        trained_M, mean, std = do_kfolds(x, y)

    if choice == 1:
        trained_M, mean, std = _train_and_evaluate_holdout(
            np.array(x), np.array(y), x_test, y_test)

    if choice == 2:
        trained_M, mean, std = do_crossval(x, y)

    if choice == 3:
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.2,
                                                            train_size=0.8,
                                                            random_state=seed,
                                                            stratify=y)
        trained_M, mean, std = _train_and_evaluate_holdout(
            x_train, y_train, x_test, y_test)

    if not predicting:
        # Persist the scaling statistics so that mode 6 can reproduce them.
        with open("./" + std_name + ".pickle", 'wb') as output:
            pickle.dump(std, output)
        with open("./" + mean_name + ".pickle", 'wb') as output:
            pickle.dump(mean, output)

        # serialize model to JSON
        model_json = trained_M.to_json()
        with open("./" + model_name + ".json", "w") as json_file:
            json_file.write(model_json)
        # serialize weights to HDF5
        trained_M.save_weights("./" + model_name + ".h5")
        print("Saved model to disk")
def do_crossval(x, y):
    """Pick a dropout rate on a validation split and score it on a test split.

    The data is split with stratification into 80% / 20% (test), and the
    80% part again into 75% (train) / 25% (validation), i.e. 60/20/20
    overall. All three parts are standardised with the training part's
    statistics. One model per candidate dropout rate is fitted and scored
    on the validation part; the rate with the highest MCC is then used to
    fit a fresh model on the training part, which is scored on the test part.

    Args:
        x: Feature matrix (sequence of numeric feature vectors).
        y: Class labels aligned with ``x``.

    Returns:
        A tuple ``(trained_M, mean, std)``: the model fitted with the best
        dropout rate and the training-part statistics used for scaling
        (``std`` is epsilon-padded).
    """
    rate_arr = [0.2, 0.3, 0.4, 0.5, 0.6]  # candidate dropout rates

    x, x_test, y, y_test = train_test_split(x,
                                            y,
                                            test_size=0.2,
                                            train_size=0.8,
                                            random_state=seed,
                                            stratify=y)
    x_train, x_validate, y_train, y_validate = train_test_split(
        x, y, test_size=0.25, train_size=0.75, random_state=seed, stratify=y)

    # Float arrays: the splits may be plain lists or integer arrays, for
    # which an in-place ``-=`` / ``/=`` is either undefined or fails to cast.
    x_train = np.asarray(x_train, dtype=float)
    x_validate = np.asarray(x_validate, dtype=float)
    x_test = np.asarray(x_test, dtype=float)

    mean = np.mean(x_train, axis=0)
    std = np.std(x_train, axis=0) + 10**-5  # epsilon guards constant features
    x_train = (x_train - mean) / std
    x_validate = (x_validate - mean) / std
    x_test = (x_test - mean) / std

    def _fit(rate):
        """Build and fit a model on the training part with this dropout rate."""
        print("Training model on data...")
        s_training = time.time()
        M = trainer.build_sequential_model(rate=rate, shape=x_train.shape[1])
        fitted = trainer.fit_model_batch(M, x_train, y_train, num_epoch=2000)
        e_training = time.time()
        print("Training model on data...OK, took: " +
              str((e_training - s_training)))
        return fitted

    def _classify(fitted, features):
        """Return the flat array of predicted classes for ``features``."""
        print("Classifying data...")
        s_classify = time.time()
        predicted = np.array(fitted.predict_classes(features)).ravel()
        e_classify = time.time()
        print("Classifying data...OK, took: " +
              str((e_classify - s_classify)))
        return predicted

    mccs = []
    for rate in rate_arr:
        candidate = _fit(rate)
        classes = _classify(candidate, x_validate)
        mcc, accuracy, fscore, precision, recall = get_performance_vals(
            y_validate, classes)
        mccs.append(mcc)

    best_rate = rate_arr[np.argmax(mccs)]
    print("Best dropout rate is %f" % (best_rate))

    # Refit from scratch with the winning rate and score on the untouched
    # test part.
    trained_M = _fit(best_rate)
    classes = _classify(trained_M, x_test)
    print("Best dropout rate is %f" % (best_rate))
    mcc, accuracy, fscore, precision, recall = get_performance_vals(
        y_test, classes)
    return trained_M, mean, std