def main_train(pos_sequences=None, neg_sequences=None, prefix=None,
               arch_file=None, weights_file=None, **kwargs):
    kwargs = {key: value for key, value in kwargs.items() if value is not None}
    # encode fastas
    print("loading sequence data...")
    X_pos = encode_fasta_sequences(pos_sequences)
    y_pos = np.array([[True]] * len(X_pos))
    X_neg = encode_fasta_sequences(neg_sequences)
    y_neg = np.array([[False]] * len(X_neg))
    X = np.concatenate((X_pos, X_neg))
    y = np.concatenate((y_pos, y_neg))
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
    if arch_file is not None:
        # load model
        print("loading model...")
        model = SequenceDNN.load(arch_file, weights_file)
    else:
        # initialize model
        print("initializing model...")
        model = SequenceDNN(seq_length=X_train.shape[-1], **kwargs)
    # train
    print("starting model training...")
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    valid_result = model.test(X_valid, y_valid)
    print("final validation metrics:")
    print(valid_result)
    # save
    print("saving model files...")
    model.save(prefix)
    print("Done!")
def main_train(pos_sequences=None, neg_sequences=None, prefix=None,
               model_file=None, weights_file=None):
    # encode fastas
    print("loading sequence data...")
    X_pos = encode_fasta_sequences(pos_sequences)
    y_pos = np.array([[True]] * len(X_pos))
    X_neg = encode_fasta_sequences(neg_sequences)
    y_neg = np.array([[False]] * len(X_neg))
    X = np.concatenate((X_pos, X_neg))
    y = np.concatenate((y_pos, y_neg))
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
    if model_file is not None and weights_file is not None:
        # load model
        print("loading model...")
        model = SequenceDNN.load(model_file, weights_file)
    else:
        # initialize model
        print("initializing model...")
        model = SequenceDNN(seq_length=X_train.shape[-1])
    # train
    print("starting model training...")
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    valid_result = model.test(X_valid, y_valid)
    print("final validation metrics:")
    print(valid_result)
    # save
    print("saving model files...")
    model.save("%s.model.json" % prefix, "%s.weights.hd5" % prefix)
    print("Done!")
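# Usage sketch for main_train (hedged): the FASTA paths and prefix below are
# hypothetical placeholders, and the call assumes the helpers main_train uses
# (encode_fasta_sequences, train_test_split, SequenceDNN, np) are in scope.
def example_main_train():
    # train a fresh single-task SequenceDNN on an 80/20 train/validation
    # split of the encoded positives/negatives, then save the model files
    main_train(pos_sequences="positives.fa",  # hypothetical FASTA of positive sequences
               neg_sequences="negatives.fa",  # hypothetical FASTA of negative sequences
               prefix="example_model")        # output prefix for .model.json / .weights.hd5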
def run(use_deep_CNN, use_RNN, label, golden_first_sequence, golden_results):
    seq_length = 100
    num_sequences = 200
    num_positives = 100
    num_negatives = num_sequences - num_positives
    GC_fraction = 0.4
    test_fraction = 0.2
    num_epochs = 1
    sequences, labels, embeddings = simulate_single_motif_detection(
        'SPI1_disc1', seq_length, num_positives, num_negatives, GC_fraction)
    assert sequences[0] == golden_first_sequence, \
        'first sequence = {}, golden = {}'.format(sequences[0], golden_first_sequence)
    encoded_sequences = one_hot_encode(sequences)
    X_train, X_test, y_train, y_test = train_test_split(
        encoded_sequences, labels, test_size=test_fraction)
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    random_order = np.arange(len(X_train))
    np.random.shuffle(random_order)
    X_train = X_train[random_order]
    y_train = y_train[random_order]
    hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                       'num_filters': (45,), 'pool_width': 25, 'conv_width': (10,),
                       'L1': 0, 'dropout': 0.2, 'num_epochs': num_epochs}
    if use_deep_CNN:
        hyperparameters.update({'num_filters': (45, 50, 50),
                                'conv_width': (10, 8, 5)})
    if use_RNN:
        hyperparameters.update({'GRU_size': 35, 'TDD_size': 45})
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_test, y_test))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
def run(use_deep_CNN, use_RNN, label, golden_results):
    seq_length = 100
    num_sequences = 200
    test_fraction = 0.2
    num_epochs = 1
    sequences = np.array([''.join(random.choice('ACGT') for base in range(seq_length))
                          for sequence in range(num_sequences)])
    labels = np.random.choice((True, False), size=num_sequences)[:, None]
    encoded_sequences = one_hot_encode(sequences)
    X_train, X_test, y_train, y_test = train_test_split(
        encoded_sequences, labels, test_size=test_fraction)
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    random_order = np.arange(len(X_train))
    np.random.shuffle(random_order)
    X_train = X_train[random_order]
    y_train = y_train[random_order]
    hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                       'num_filters': (45,), 'pool_width': 25, 'conv_width': (10,),
                       'L1': 0, 'dropout': 0.2, 'num_epochs': num_epochs}
    if use_deep_CNN:
        hyperparameters.update({'num_filters': (45, 50, 50),
                                'conv_width': (10, 8, 5)})
    if use_RNN:
        hyperparameters.update({'GRU_size': 35, 'TDD_size': 45})
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_test, y_test))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
def train_test_dnn_vary_parameter(prefix, model_parameters, param_name, param_values,
                                  X_train=None, y_train=None, X_valid=None, y_valid=None,
                                  X_test=None, y_test=None):
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    dnn_results = []
    for param_value in param_values:
        model_parameters[param_name] = param_value
        ofname_infix = dict2string(model_parameters)
        ofname_prefix = "%s.%s" % (prefix, ofname_infix)
        model_fname = "%s.arch.json" % ofname_prefix
        weights_fname = "%s.weights.hd5" % ofname_prefix
        try:
            logger.debug("Checking for model files {} and {}...".format(
                model_fname, weights_fname))
            dnn = SequenceDNN.load(model_fname, weights_fname)
            logger.debug("Model files found. Loaded model successfully!")
        except:
            logger.debug("Model files not found. Training model...")
            dnn = SequenceDNN(**model_parameters)
            logger.info("training with %s %s .." % (param_name, param_value))
            dnn.train(X_train, y_train, (X_valid, y_valid))
            dnn.save(model_fname, weights_fname)
        dnn_results.append(dnn.test(X_test, y_test))
    return dnn_results
def train_test_dnn_vary_parameter(prefix, model_parameters, param_name, param_values,
                                  X_train=None, y_train=None, X_valid=None, y_valid=None,
                                  X_test=None, y_test=None):
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    dnn_results = []
    for param_value in param_values:
        model_parameters[param_name] = param_value
        ofname_infix = dict2string(model_parameters)
        ofname_prefix = "%s.%s" % (prefix, ofname_infix)
        model_fname = "%s.arch.json" % ofname_prefix
        weights_fname = "%s.weights.h5" % ofname_prefix
        try:
            logger.debug("Checking for model files {} and {}...".format(
                model_fname, weights_fname))
            dnn = SequenceDNN.load(model_fname, weights_fname)
            logger.debug("Model files found. Loaded model successfully!")
        except:
            logger.debug("Model files not found. Training model...")
            dnn = SequenceDNN(**model_parameters)
            logger.info("training with %s %s .." % (param_name, param_value))
            dnn.train(X_train, y_train, (X_valid, y_valid))
            dnn.save(ofname_prefix)
        dnn_results.append(dnn.test(X_test, y_test))
    return dnn_results
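# Usage sketch for train_test_dnn_vary_parameter (hedged): sweeps dropout over
# three values on random sequence data, mirroring the data setup in the run()
# variants above. The prefix, sequence counts, and parameter values are
# illustrative; one_hot_encode, train_test_split, SequenceDNN, np, and random
# are assumed in scope as elsewhere in this file.
def example_vary_dropout():
    sequences = np.array([''.join(random.choice('ACGT') for base in range(100))
                          for sequence in range(500)])
    labels = np.random.choice((True, False), size=500)[:, None]
    X = one_hot_encode(sequences)
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2)
    # returns one test result per dropout value tried
    return train_test_dnn_vary_parameter(
        "dropout_sweep", {'seq_length': 100, 'num_epochs': 1},
        "dropout", [0.0, 0.2, 0.5],
        X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid,
        X_test=X_test, y_test=y_test)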
def train_test_dnn_vary_data_size(prefix, model_parameters=None,
                                  X_train=None, y_train=None, X_valid=None, y_valid=None,
                                  X_test=None, y_test=None, train_set_sizes=None):
    dnn_results = []
    for train_set_size in train_set_sizes:
        ofname_infix = dict2string(model_parameters)
        ofname_infix = "%s.train_set_size_%s" % (ofname_infix, str(train_set_size))
        ofname_prefix = "%s.%s" % (prefix, ofname_infix)
        model_fname = "%s.arch.json" % ofname_prefix
        weights_fname = "%s.weights.h5" % ofname_prefix
        try:
            logger.debug("Checking for model files {} and {}...".format(
                model_fname, weights_fname))
            best_dnn = SequenceDNN.load(model_fname, weights_fname)
            logger.debug("Model files found. Loaded model successfully!")
        except:
            logger.debug("Model files not found. Training model...")
            # try 3 attempts, take best auROC, save that model
            X_train_subset = X_train[:train_set_size]
            X_train_subset = np.concatenate(
                (X_train_subset, reverse_complement(X_train_subset)))
            y_train_subset = np.concatenate(
                (y_train[:train_set_size], y_train[:train_set_size]))
            best_auROC = 0
            best_dnn = None
            for random_seed in [1, 2, 3]:
                np.random.seed(random_seed)
                random.seed(random_seed)
                dnn = SequenceDNN(**model_parameters)
                logger.info("training with %i examples.." % train_set_size)
                dnn.train(X_train_subset, y_train_subset, (X_valid, y_valid))
                result = dnn.test(X_test, y_test)
                auROCs = [result.results[i]["auROC"] for i in range(y_valid.shape[-1])]
                # get average auROC across tasks
                mean_auROC = sum(auROCs) / len(auROCs)
                if mean_auROC > best_auROC:
                    best_auROC = mean_auROC
                    dnn.save(ofname_prefix)
                    best_dnn = dnn
        dnn_results.append(best_dnn.test(X_test, y_test))
        # reset to original random seed
        np.random.seed(1)
        random.seed(1)
    return dnn_results
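# Usage sketch for train_test_dnn_vary_data_size (hedged): given pre-split
# data, trains the same architecture on nested subsets of the training set to
# trace a learning curve; the prefix, sizes, and model parameters below are
# illustrative only.
def example_vary_data_size(X_train, y_train, X_valid, y_valid, X_test, y_test):
    # returns one test result per training-set size
    return train_test_dnn_vary_data_size(
        "learning_curve",
        model_parameters={'seq_length': X_train.shape[-1], 'num_epochs': 1},
        X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid,
        X_test=X_test, y_test=y_test,
        train_set_sizes=[100, 500, 1000])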
def run(use_deep_CNN, use_RNN, label, golden_results):
    import random
    np.random.seed(1)
    random.seed(1)
    from dragonn.models import SequenceDNN
    from simdna.simulations import simulate_single_motif_detection
    from dragonn.utils import one_hot_encode, reverse_complement
    from sklearn.cross_validation import train_test_split
    seq_length = 50
    num_sequences = 100
    num_positives = 50
    num_negatives = num_sequences - num_positives
    GC_fraction = 0.4
    test_fraction = 0.2
    validation_fraction = 0.2
    num_epochs = 1
    sequences, labels = simulate_single_motif_detection(
        'SPI1_disc1', seq_length, num_positives, num_negatives, GC_fraction)
    encoded_sequences = one_hot_encode(sequences)
    X_train, X_test, y_train, y_test = train_test_split(
        encoded_sequences, labels, test_size=test_fraction)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train, test_size=validation_fraction)
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    random_order = np.arange(len(X_train))
    np.random.shuffle(random_order)
    X_train = X_train[random_order]
    y_train = y_train[random_order]
    hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                       'num_filters': (45,), 'pool_width': 25, 'conv_width': (10,),
                       'L1': 0, 'dropout': 0.2, 'num_epochs': num_epochs}
    if use_deep_CNN:
        hyperparameters.update({'num_filters': (45, 50, 50),
                                'conv_width': (10, 8, 5)})
    if use_RNN:
        hyperparameters.update({'GRU_size': 35, 'TDD_size': 45})
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
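# Usage sketch for run() (hedged): run() re-trains from fixed seeds and
# asserts the resulting metrics match previously recorded "golden" values via
# np.allclose, so golden_results must be a dict of metric values captured from
# an earlier trusted run on the same platform and seeds (its keys and values
# are deliberately not invented here).
def example_regression_check(golden_results):
    run(use_deep_CNN=False, use_RNN=False,
        label="shallow_CNN", golden_results=golden_results)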
def main_train(pos_sequences=None, neg_sequences=None,
               pos_validation_sequences=None, neg_validation_sequences=None,
               prefix=None, arch_file=None, weights_file=None, **kwargs):
    kwargs = {key: value for key, value in kwargs.items() if value is not None}
    # encode fastas
    print("loading sequence data...")
    X_pos = encode_fasta_sequences(pos_sequences)
    y_pos = np.array([[True]] * len(X_pos))
    X_neg = encode_fasta_sequences(neg_sequences)
    y_neg = np.array([[False]] * len(X_neg))
    X = np.concatenate((X_pos, X_neg))
    y = np.concatenate((y_pos, y_neg))
    # if a validation set is provided by the user, encode that as well
    if pos_validation_sequences is not None or neg_validation_sequences is not None:
        # both positive and negative validation sequences must be provided
        assert neg_validation_sequences is not None
        assert pos_validation_sequences is not None
        X_valid_pos = encode_fasta_sequences(pos_validation_sequences)
        X_valid_neg = encode_fasta_sequences(neg_validation_sequences)
        y_valid_pos = np.array([[True]] * len(X_valid_pos))
        y_valid_neg = np.array([[False]] * len(X_valid_neg))
        X_valid = np.concatenate((X_valid_pos, X_valid_neg))
        y_valid = np.concatenate((y_valid_pos, y_valid_neg))
        # all of the encoded training fastas are used for training when a
        # separate validation set is supplied
        X_train, y_train = X, y
    else:
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
    if arch_file is not None:
        # load model
        print("loading model...")
        model = SequenceDNN.load(arch_file, weights_file)
    else:
        # initialize model
        print("initializing model...")
        model = SequenceDNN(seq_length=X_train.shape[-1], **kwargs)
    # train
    print("starting model training...")
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    valid_result = model.test(X_valid, y_valid)
    print("final validation metrics:")
    print(valid_result)
    # save
    print("saving model files...")
    model.save(prefix)
    print("Done!")
def train_test_dnn_vary_data_size(prefix, model_parameters=None,
                                  X_train=None, y_train=None, X_valid=None, y_valid=None,
                                  X_test=None, y_test=None, train_set_sizes=None):
    dnn_results = []
    for train_set_size in train_set_sizes:
        ofname_infix = dict2string(model_parameters)
        ofname_infix = "%s.train_set_size_%s" % (ofname_infix, str(train_set_size))
        ofname_prefix = "%s.%s" % (prefix, ofname_infix)
        model_fname = "%s.arch.json" % ofname_prefix
        weights_fname = "%s.weights.hd5" % ofname_prefix
        try:
            logger.debug("Checking for model files {} and {}...".format(
                model_fname, weights_fname))
            best_dnn = SequenceDNN.load(model_fname, weights_fname)
            logger.debug("Model files found. Loaded model successfully!")
        except:
            logger.debug("Model files not found. Training model...")
            # try 3 attempts, take best auROC, save that model
            X_train_subset = X_train[:train_set_size]
            X_train_subset = np.concatenate(
                (X_train_subset, reverse_complement(X_train_subset)))
            y_train_subset = np.concatenate(
                (y_train[:train_set_size], y_train[:train_set_size]))
            best_auROC = 0
            best_dnn = None
            for random_seed in [1, 2, 3]:
                np.random.seed(random_seed)
                random.seed(random_seed)
                dnn = SequenceDNN(**model_parameters)
                logger.info("training with %i examples.." % train_set_size)
                dnn.train(X_train_subset, y_train_subset, (X_valid, y_valid))
                result = dnn.test(X_test, y_test)
                auROCs = [result.results[i]["auROC"] for i in range(y_valid.shape[-1])]
                # get average auROC across tasks
                mean_auROC = sum(auROCs) / len(auROCs)
                if mean_auROC > best_auROC:
                    best_auROC = mean_auROC
                    dnn.save(model_fname, weights_fname)
                    best_dnn = dnn
        dnn_results.append(best_dnn.test(X_test, y_test))
        # reset to original random seed
        np.random.seed(1)
        random.seed(1)
    return dnn_results
np.random.shuffle(random_order)
X_train = X_train[random_order]
y_train = y_train[random_order]
# Build and train model
if not do_hyperparameter_search:
    hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                       'num_filters': (45,), 'pool_width': 25, 'conv_width': (10,),
                       'L1': 0, 'dropout': 0.2, 'num_epochs': num_epochs}
    if use_deep_CNN:
        hyperparameters.update({'num_filters': (45, 50, 50),
                                'conv_width': (10, 8, 5)})
    if use_RNN:
        hyperparameters.update({'GRU_size': 35, 'TDD_size': 45})
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
else:
    print('Starting hyperparameter search...')
    from dragonn.hyperparameter_search import HyperparameterSearcher
    fixed_hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                             'num_epochs': num_epochs}
    grid = {'num_filters': ((5, 100),), 'pool_width': (5, 40),
            'conv_width': ((6, 20),), 'dropout': (0, 0.5)}
    if use_deep_CNN:
        grid.update({'num_filters': ((5, 100), (5, 100), (5, 100)),
                     'conv_width': ((6, 20), (6, 20), (6, 20))})
    if use_RNN:
        grid.update({'GRU_size': (10, 50), 'TDD_size': (20, 60)})
    # Backend is RandomSearch; if using Python 2, can also specify MOESearch
    # (requires separate installation)
np.random.shuffle(random_order)
X_train = X_train[random_order]
y_train = y_train[random_order]
# Build and train model
if not do_hyperparameter_search:
    hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                       'num_filters': (45,), 'pool_width': 25, 'conv_width': (10,),
                       'L1': 0, 'dropout': 0.2, 'num_epochs': num_epochs}
    if use_deep_CNN:
        hyperparameters.update({'num_filters': (45, 50, 50),
                                'conv_width': (10, 8, 5)})
    if use_RNN:
        hyperparameters.update({'GRU_size': 35, 'TDD_size': 45})
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_valid, y_valid),
                save_best_model_to_prefix='best_model')
else:
    print('Starting hyperparameter search...')
    from dragonn.hyperparameter_search import HyperparameterSearcher, RandomSearch
    fixed_hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                             'num_epochs': num_epochs}
    grid = {'num_filters': ((5, 100),), 'pool_width': (5, 40),
            'conv_width': ((6, 20),), 'dropout': (0, 0.5)}
    if use_deep_CNN:
        grid.update({'num_filters': ((5, 100), (5, 100), (5, 100)),
                     'conv_width': ((6, 20), (6, 20), (6, 20))})
    if use_RNN:
        grid.update({'GRU_size': (10, 50), 'TDD_size': (20, 60)})
    # Backend is RandomSearch; if using Python 2, can also specify MOESearch
    # (requires separate installation)
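    # Hedged continuation sketch: the snippet above is truncated here. This
    # assumes HyperparameterSearcher follows a construct / search / best_model
    # pattern with the RandomSearch backend named in the comment above, and
    # that num_hyperparameter_trials is defined earlier in the full function;
    # verify the names against dragonn.hyperparameter_search before relying
    # on them.
    searcher = HyperparameterSearcher(SequenceDNN, fixed_hyperparameters, grid,
                                      X_train, y_train,
                                      validation_data=(X_valid, y_valid),
                                      backend=RandomSearch)
    searcher.search(num_hyperparameter_trials)  # assumed to run the trials
    model = searcher.best_model                 # assumed best-scoring model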