def run(use_deep_CNN, use_RNN, label, golden_results):
    seq_length = 100
    num_sequences = 200
    test_fraction = 0.2
    num_epochs = 1
    sequences = np.array([''.join(random.choice('ACGT') for base in range(seq_length))
                          for sequence in range(num_sequences)])
    labels = np.random.choice((True, False), size=num_sequences)[:, None]
    encoded_sequences = one_hot_encode(sequences)
    X_train, X_test, y_train, y_test = train_test_split(
        encoded_sequences, labels, test_size=test_fraction)
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    random_order = np.arange(len(X_train))
    np.random.shuffle(random_order)
    X_train = X_train[random_order]
    y_train = y_train[random_order]
    hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                       'num_filters': (45,), 'pool_width': 25, 'conv_width': (10,),
                       'L1': 0, 'dropout': 0.2, 'num_epochs': num_epochs}
    if use_deep_CNN:
        hyperparameters.update({'num_filters': (45, 50, 50), 'conv_width': (10, 8, 5)})
    if use_RNN:
        hyperparameters.update({'GRU_size': 35, 'TDD_size': 45})
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_test, y_test))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
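# The helpers one_hot_encode and reverse_complement above come from
# dragonn.utils. A minimal standalone sketch of the assumed semantics follows:
# the (N, 1, 4, L) layout with rows in ACGT order is an assumption here, not
# something stated in this file.
import numpy as np

def one_hot_encode_sketch(sequences):
    # Encode each sequence as a boolean (1, 4, seq_length) array.
    base_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    encoded = np.zeros((len(sequences), 1, 4, len(sequences[0])), dtype=bool)
    for i, sequence in enumerate(sequences):
        for j, base in enumerate(sequence):
            encoded[i, 0, base_index[base], j] = True
    return encoded

def reverse_complement_sketch(encoded_sequences):
    # Complementing swaps A<->T and C<->G, which in ACGT row order is a flip
    # along the base axis; reversing the position axis reverses the sequence.
    return encoded_sequences[..., ::-1, ::-1]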
def run(use_deep_CNN, use_RNN, label, golden_first_sequence, golden_results):
    seq_length = 100
    num_sequences = 200
    num_positives = 100
    num_negatives = num_sequences - num_positives
    GC_fraction = 0.4
    test_fraction = 0.2
    num_epochs = 1
    sequences, labels, embeddings = simulate_single_motif_detection(
        'SPI1_disc1', seq_length, num_positives, num_negatives, GC_fraction)
    assert sequences[0] == golden_first_sequence, \
        'first sequence = {}, golden = {}'.format(sequences[0], golden_first_sequence)
    encoded_sequences = one_hot_encode(sequences)
    X_train, X_test, y_train, y_test = train_test_split(
        encoded_sequences, labels, test_size=test_fraction)
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    random_order = np.arange(len(X_train))
    np.random.shuffle(random_order)
    X_train = X_train[random_order]
    y_train = y_train[random_order]
    hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                       'num_filters': (45,), 'pool_width': 25, 'conv_width': (10,),
                       'L1': 0, 'dropout': 0.2, 'num_epochs': num_epochs}
    if use_deep_CNN:
        hyperparameters.update({'num_filters': (45, 50, 50), 'conv_width': (10, 8, 5)})
    if use_RNN:
        hyperparameters.update({'GRU_size': 35, 'TDD_size': 45})
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_test, y_test))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
def run(use_deep_CNN, use_RNN, label, golden_results):
    import random
    np.random.seed(1)
    random.seed(1)
    from dragonn.models import SequenceDNN
    from simdna.simulations import simulate_single_motif_detection
    from dragonn.utils import one_hot_encode, reverse_complement
    # sklearn.cross_validation was removed in scikit-learn 0.20;
    # train_test_split now lives in sklearn.model_selection.
    from sklearn.model_selection import train_test_split
    seq_length = 50
    num_sequences = 100
    num_positives = 50
    num_negatives = num_sequences - num_positives
    GC_fraction = 0.4
    test_fraction = 0.2
    validation_fraction = 0.2
    num_epochs = 1
    sequences, labels = simulate_single_motif_detection(
        'SPI1_disc1', seq_length, num_positives, num_negatives, GC_fraction)
    encoded_sequences = one_hot_encode(sequences)
    X_train, X_test, y_train, y_test = train_test_split(
        encoded_sequences, labels, test_size=test_fraction)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train, test_size=validation_fraction)
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    random_order = np.arange(len(X_train))
    np.random.shuffle(random_order)
    X_train = X_train[random_order]
    y_train = y_train[random_order]
    hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                       'num_filters': (45,), 'pool_width': 25, 'conv_width': (10,),
                       'L1': 0, 'dropout': 0.2, 'num_epochs': num_epochs}
    if use_deep_CNN:
        hyperparameters.update({'num_filters': (45, 50, 50), 'conv_width': (10, 8, 5)})
    if use_RNN:
        hyperparameters.update({'GRU_size': 35, 'TDD_size': 45})
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
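# Hedged usage sketch for the test harness above: only the call shape is
# illustrated. The metric names and golden values below are hypothetical
# placeholders; in the real test suite they would be recorded from a
# known-good run.
hypothetical_golden_results = {'auROC': 0.5, 'auPRC': 0.5}
run(use_deep_CNN=False, use_RNN=False, label='shallow_CNN',
    golden_results=hypothetical_golden_results)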
def train_test_dnn_vary_parameter(prefix, model_parameters, param_name, param_values,
                                  X_train=None, y_train=None, X_valid=None, y_valid=None,
                                  X_test=None, y_test=None):
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    dnn_results = []
    for param_value in param_values:
        model_parameters[param_name] = param_value
        ofname_infix = dict2string(model_parameters)
        ofname_prefix = "%s.%s" % (prefix, ofname_infix)
        model_fname = "%s.arch.json" % (ofname_prefix)
        weights_fname = "%s.weights.h5" % (ofname_prefix)
        try:
            logger.debug("Checking for model files {} and {}...".format(
                model_fname, weights_fname))
            dnn = SequenceDNN.load(model_fname, weights_fname)
            logger.debug("Model files found. Loaded model successfully!")
        except Exception:  # model files missing or unreadable: train from scratch
            logger.debug("Model files not found. Training model...")
            dnn = SequenceDNN(**model_parameters)
            logger.info("training with %s %s .." % (param_name, param_value))
            dnn.train(X_train, y_train, (X_valid, y_valid))
            dnn.save(ofname_prefix)
        dnn_results.append(dnn.test(X_test, y_test))
    return dnn_results
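# Hedged usage sketch for the parameter sweep above. The hyperparameter values
# and output prefix are illustrative, and X_train/y_train/X_valid/y_valid/
# X_test/y_test are assumed to be one-hot-encoded arrays prepared as in the
# run() functions earlier.
base_parameters = {'seq_length': 100, 'num_filters': (45,), 'conv_width': (10,),
                   'pool_width': 25, 'num_epochs': 1}
sweep_results = train_test_dnn_vary_parameter(
    'dropout_sweep', base_parameters, 'dropout', [0.0, 0.2, 0.5],
    X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid,
    X_test=X_test, y_test=y_test)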
def main_train(pos_sequences=None, neg_sequences=None, prefix=None,
               model_file=None, weights_file=None):
    # encode fastas
    print("loading sequence data...")
    X_pos = encode_fasta_sequences(pos_sequences)
    y_pos = np.array([[True]] * len(X_pos))
    X_neg = encode_fasta_sequences(neg_sequences)
    y_neg = np.array([[False]] * len(X_neg))
    X = np.concatenate((X_pos, X_neg))
    y = np.concatenate((y_pos, y_neg))
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
    if model_file is not None and weights_file is not None:
        # load model
        print("loading model...")
        model = SequenceDNN.load(model_file, weights_file)
    else:
        # initialize model
        print("initializing model...")
        model = SequenceDNN(seq_length=X_train.shape[-1])
    # train
    print("starting model training...")
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    valid_result = model.test(X_valid, y_valid)
    print("final validation metrics:")
    print(valid_result)
    # save
    print("saving model files..")
    model.save("%s.model.json" % (prefix), "%s.weights.hd5" % (prefix))
    print("Done!")
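# Hedged usage sketch for main_train above; the FASTA paths and prefix are
# hypothetical file names, not ones referenced in this code.
main_train(pos_sequences='positives.fa', neg_sequences='negatives.fa',
           prefix='my_model')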
def main_train(pos_sequences=None, neg_sequences=None, prefix=None,
               arch_file=None, weights_file=None, **kwargs):
    kwargs = {key: value for key, value in kwargs.items() if value is not None}
    # encode fastas
    print("loading sequence data...")
    X_pos = encode_fasta_sequences(pos_sequences)
    y_pos = np.array([[True]] * len(X_pos))
    X_neg = encode_fasta_sequences(neg_sequences)
    y_neg = np.array([[False]] * len(X_neg))
    X = np.concatenate((X_pos, X_neg))
    y = np.concatenate((y_pos, y_neg))
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
    if arch_file is not None:
        # load model
        print("loading model...")
        model = SequenceDNN.load(arch_file, weights_file)
    else:
        # initialize model
        print("initializing model...")
        model = SequenceDNN(seq_length=X_train.shape[-1], **kwargs)
    # train
    print("starting model training...")
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    valid_result = model.test(X_valid, y_valid)
    print("final validation metrics:")
    print(valid_result)
    # save
    print("saving model files..")
    model.save(prefix)
    print("Done!")
def train_test_dnn_vary_parameter(prefix, model_parameters, param_name, param_values,
                                  X_train=None, y_train=None, X_valid=None, y_valid=None,
                                  X_test=None, y_test=None):
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    dnn_results = []
    for param_value in param_values:
        model_parameters[param_name] = param_value
        ofname_infix = dict2string(model_parameters)
        ofname_prefix = "%s.%s" % (prefix, ofname_infix)
        model_fname = "%s.arch.json" % (ofname_prefix)
        weights_fname = "%s.weights.hd5" % (ofname_prefix)
        try:
            logger.debug("Checking for model files {} and {}...".format(
                model_fname, weights_fname))
            dnn = SequenceDNN.load(model_fname, weights_fname)
            logger.debug("Model files found. Loaded model successfully!")
        except Exception:  # model files missing or unreadable: train from scratch
            logger.debug("Model files not found. Training model...")
            dnn = SequenceDNN(**model_parameters)
            logger.info("training with %s %s .." % (param_name, param_value))
            dnn.train(X_train, y_train, (X_valid, y_valid))
            dnn.save(model_fname, weights_fname)
        dnn_results.append(dnn.test(X_test, y_test))
    return dnn_results
def train_test_dnn_vary_data_size(prefix, model_parameters=None,
                                  X_train=None, y_train=None,
                                  X_valid=None, y_valid=None,
                                  X_test=None, y_test=None, train_set_sizes=None):
    dnn_results = []
    for train_set_size in train_set_sizes:
        ofname_infix = dict2string(model_parameters)
        ofname_infix = "%s.train_set_size_%s" % (ofname_infix, str(train_set_size))
        ofname_prefix = "%s.%s" % (prefix, ofname_infix)
        model_fname = "%s.arch.json" % (ofname_prefix)
        weights_fname = "%s.weights.h5" % (ofname_prefix)
        try:
            logger.debug("Checking for model files {} and {}...".format(
                model_fname, weights_fname))
            best_dnn = SequenceDNN.load(model_fname, weights_fname)
            logger.debug("Model files found. Loaded model successfully!")
        except Exception:  # model files missing or unreadable: train from scratch
            logger.debug("Model files not found. Training model...")
            # try 3 attempts, take best auROC, save that model
            X_train_subset = X_train[:train_set_size]
            X_train_subset = np.concatenate(
                (X_train_subset, reverse_complement(X_train_subset)))
            y_train_subset = np.concatenate(
                (y_train[:train_set_size], y_train[:train_set_size]))
            best_auROC = 0
            best_dnn = None
            for random_seed in [1, 2, 3]:
                np.random.seed(random_seed)
                random.seed(random_seed)
                dnn = SequenceDNN(**model_parameters)
                logger.info("training with %i examples.." % (train_set_size))
                dnn.train(X_train_subset, y_train_subset, (X_valid, y_valid))
                result = dnn.test(X_test, y_test)
                auROCs = [result.results[i]["auROC"]
                          for i in range(y_valid.shape[-1])]
                # get average auROC across tasks
                mean_auROC = sum(auROCs) / len(auROCs)
                if mean_auROC > best_auROC:
                    best_auROC = mean_auROC
                    dnn.save(ofname_prefix)
                    best_dnn = dnn
        dnn_results.append(best_dnn.test(X_test, y_test))
        # reset to original random seed
        np.random.seed(1)
        random.seed(1)
    return dnn_results
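# Hedged usage sketch for the learning-curve sweep above; the parameter values,
# prefix, and training-set sizes are illustrative, and the data arrays are
# assumed to be prepared as in the run() functions earlier.
learning_curve_results = train_test_dnn_vary_data_size(
    'learning_curve',
    model_parameters={'seq_length': 100, 'num_filters': (45,),
                      'conv_width': (10,), 'pool_width': 25, 'num_epochs': 1},
    X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid,
    X_test=X_test, y_test=y_test, train_set_sizes=[500, 1000, 2000])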
def main_predict(sequences=None, model_file=None, weights_file=None, output_file=None):
    # encode fasta
    print("loading sequence data...")
    X = encode_fasta_sequences(sequences)
    # load model
    print("loading model...")
    model = SequenceDNN.load(model_file, weights_file)
    # predict
    print("getting predictions...")
    predictions = model.predict(X)
    # save predictions
    print("saving predictions to output file...")
    np.savetxt(output_file, predictions)
    print("Done!")
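# Hedged usage sketch for main_predict above; all file names are hypothetical.
main_predict(sequences='test.fa', model_file='my_model.arch.json',
             weights_file='my_model.weights.h5', output_file='predictions.txt')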
def main_train(pos_sequences=None, neg_sequences=None,
               pos_validation_sequences=None, neg_validation_sequences=None,
               prefix=None, arch_file=None, weights_file=None, **kwargs):
    kwargs = {key: value for key, value in kwargs.items() if value is not None}
    # encode fastas
    print("loading sequence data...")
    X_pos = encode_fasta_sequences(pos_sequences)
    y_pos = np.array([[True]] * len(X_pos))
    X_neg = encode_fasta_sequences(neg_sequences)
    y_neg = np.array([[False]] * len(X_neg))
    X = np.concatenate((X_pos, X_neg))
    y = np.concatenate((y_pos, y_neg))
    # if a validation set is provided by the user, encode that as well
    if pos_validation_sequences is not None or neg_validation_sequences is not None:
        # both positive and negative validation sequences must be provided
        assert neg_validation_sequences is not None
        assert pos_validation_sequences is not None
        X_valid_pos = encode_fasta_sequences(pos_validation_sequences)
        X_valid_neg = encode_fasta_sequences(neg_validation_sequences)
        y_valid_pos = np.array([[True]] * len(X_valid_pos))
        y_valid_neg = np.array([[False]] * len(X_valid_neg))
        X_valid = np.concatenate((X_valid_pos, X_valid_neg))
        y_valid = np.concatenate((y_valid_pos, y_valid_neg))
        # with a separate validation set, all of X and y can be used for training
        X_train, y_train = X, y
    else:
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
    if arch_file is not None:
        # load model
        print("loading model...")
        model = SequenceDNN.load(arch_file, weights_file)
    else:
        # initialize model
        print("initializing model...")
        model = SequenceDNN(seq_length=X_train.shape[-1], **kwargs)
    # train
    print("starting model training...")
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    valid_result = model.test(X_valid, y_valid)
    print("final validation metrics:")
    print(valid_result)
    # save
    print("saving model files..")
    model.save(prefix)
    print("Done!")
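# Hedged usage sketch for the variant above with a user-supplied validation
# split; all FASTA paths and the prefix are hypothetical.
main_train(pos_sequences='pos_train.fa', neg_sequences='neg_train.fa',
           pos_validation_sequences='pos_valid.fa',
           neg_validation_sequences='neg_valid.fa',
           prefix='my_model')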
def main_test(pos_sequences=None, neg_sequences=None, model_file=None, weights_file=None):
    # encode fastas
    print("loading sequence data...")
    X_test_pos = encode_fasta_sequences(pos_sequences)
    y_test_pos = np.array([[True]] * len(X_test_pos))
    X_test_neg = encode_fasta_sequences(neg_sequences)
    y_test_neg = np.array([[False]] * len(X_test_neg))
    X_test = np.concatenate((X_test_pos, X_test_neg))
    y_test = np.concatenate((y_test_pos, y_test_neg))
    # load model
    print("loading model...")
    model = SequenceDNN.load(model_file, weights_file)
    # test
    print("testing model...")
    test_result = model.test(X_test, y_test)
    print(test_result)
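# Hedged usage sketch for main_test above; all file names are hypothetical.
main_test(pos_sequences='pos_test.fa', neg_sequences='neg_test.fa',
          model_file='my_model.arch.json', weights_file='my_model.weights.h5')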
def main(args):
    '''
    args - parsed arguments that include pos_sequences, neg_sequences,
    arch_file, and weights_file
    '''
    # encode fasta
    print('Loading sequence data...')
    pos_seq = encode_fasta_sequences(args.pos_sequences)
    print('{} positive test sequences'.format(len(pos_seq)))
    neg_seq = encode_fasta_sequences(args.neg_sequences)
    print('{} negative test sequences\n'.format(len(neg_seq)))
    # load model
    prefix = args.arch_file.replace('.arch.json', '')
    print('Loading {} model...'.format(prefix))
    model = SequenceDNN.load(args.arch_file, args.weights_file)
    # predict binding probability on test sequences
    print('Getting predictions...')
    pos_predictions = model.predict(pos_seq)
    for index, pred in enumerate(pos_predictions):
        print('positive_test_{}\tP(bound)={}'.format(index, pred[0]))
    print('')
    neg_predictions = model.predict(neg_seq)
    for index, pred in enumerate(neg_predictions):
        print('negative_test_{}\tP(bound)={}'.format(index, pred[0]))
    print('')
    # visualize trained model and motifs
    print('Plotting deeplift scores on positive sequences...')
    model.plot_deeplift(pos_seq, '{}_deeplift_positive'.format(prefix))
    print('Plotting true motifs...')
    motif_names = ['IRF_known1', 'NFKB_known1']
    for index, motif in enumerate(motif_names):
        fig = plot_motif(motif, figsize=(10, 4), ylab=motif)
        fig.savefig('motif{}.png'.format(index + 1), bbox_inches='tight')
    print('Plotting architecture...')
    model.plot_architecture('{}_architecture.png'.format(prefix))
    print('Plotting convolutional filters...')
    plot_sequence_filters(model, prefix)
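# Hedged usage sketch for main above. argparse.Namespace stands in for the CLI
# parser that presumably produces these arguments, and the file names are
# hypothetical.
from argparse import Namespace

main(Namespace(pos_sequences='pos_test.fa', neg_sequences='neg_test.fa',
               arch_file='my_model.arch.json',
               weights_file='my_model.weights.h5'))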
def train_test_dnn_vary_data_size(prefix, model_parameters=None,
                                  X_train=None, y_train=None,
                                  X_valid=None, y_valid=None,
                                  X_test=None, y_test=None, train_set_sizes=None):
    dnn_results = []
    for train_set_size in train_set_sizes:
        ofname_infix = dict2string(model_parameters)
        ofname_infix = "%s.train_set_size_%s" % (ofname_infix, str(train_set_size))
        ofname_prefix = "%s.%s" % (prefix, ofname_infix)
        model_fname = "%s.arch.json" % (ofname_prefix)
        weights_fname = "%s.weights.hd5" % (ofname_prefix)
        try:
            logger.debug("Checking for model files {} and {}...".format(
                model_fname, weights_fname))
            best_dnn = SequenceDNN.load(model_fname, weights_fname)
            logger.debug("Model files found. Loaded model successfully!")
        except Exception:  # model files missing or unreadable: train from scratch
            logger.debug("Model files not found. Training model...")
            # try 3 attempts, take best auROC, save that model
            X_train_subset = X_train[:train_set_size]
            X_train_subset = np.concatenate(
                (X_train_subset, reverse_complement(X_train_subset)))
            y_train_subset = np.concatenate(
                (y_train[:train_set_size], y_train[:train_set_size]))
            best_auROC = 0
            best_dnn = None
            for random_seed in [1, 2, 3]:
                np.random.seed(random_seed)
                random.seed(random_seed)
                dnn = SequenceDNN(**model_parameters)
                logger.info("training with %i examples.." % (train_set_size))
                dnn.train(X_train_subset, y_train_subset, (X_valid, y_valid))
                result = dnn.test(X_test, y_test)
                auROCs = [result.results[i]["auROC"]
                          for i in range(y_valid.shape[-1])]
                # get average auROC across tasks
                mean_auROC = sum(auROCs) / len(auROCs)
                if mean_auROC > best_auROC:
                    best_auROC = mean_auROC
                    dnn.save(model_fname, weights_fname)
                    best_dnn = dnn
        dnn_results.append(best_dnn.test(X_test, y_test))
        # reset to original random seed
        np.random.seed(1)
        random.seed(1)
    return dnn_results
def get_SequenceDNN(SequenceDNN_parameters):
    return SequenceDNN(**SequenceDNN_parameters)
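# Hedged usage sketch for the factory above; the parameter values are
# illustrative, mirroring the hyperparameter dicts used elsewhere in this file.
model = get_SequenceDNN({'seq_length': 100, 'num_filters': (45,),
                         'conv_width': (10,), 'pool_width': 25})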
random_order = np.arange(len(X_train))
np.random.shuffle(random_order)
X_train = X_train[random_order]
y_train = y_train[random_order]
# Build and train model
if not do_hyperparameter_search:
    hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                       'num_filters': (45,), 'pool_width': 25, 'conv_width': (10,),
                       'L1': 0, 'dropout': 0.2, 'num_epochs': num_epochs}
    if use_deep_CNN:
        hyperparameters.update({'num_filters': (45, 50, 50), 'conv_width': (10, 8, 5)})
    if use_RNN:
        hyperparameters.update({'GRU_size': 35, 'TDD_size': 45})
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
else:
    print('Starting hyperparameter search...')
    from dragonn.hyperparameter_search import HyperparameterSearcher
    fixed_hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                             'num_epochs': num_epochs}
    grid = {'num_filters': ((5, 100),), 'pool_width': (5, 40),
            'conv_width': ((6, 20),), 'dropout': (0, 0.5)}
    if use_deep_CNN:
        grid.update({'num_filters': ((5, 100), (5, 100), (5, 100)),
                     'conv_width': ((6, 20), (6, 20), (6, 20))})
    if use_RNN:
        grid.update({'GRU_size': (10, 50), 'TDD_size': (20, 60)})
    # Backend is RandomSearch; if using Python 2, can also specify MOESearch
random_order = np.arange(len(X_train))
np.random.shuffle(random_order)
X_train = X_train[random_order]
y_train = y_train[random_order]
# Build and train model
if not do_hyperparameter_search:
    hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                       'num_filters': (45,), 'pool_width': 25, 'conv_width': (10,),
                       'L1': 0, 'dropout': 0.2, 'num_epochs': num_epochs}
    if use_deep_CNN:
        hyperparameters.update({'num_filters': (45, 50, 50), 'conv_width': (10, 8, 5)})
    if use_RNN:
        hyperparameters.update({'GRU_size': 35, 'TDD_size': 45})
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_valid, y_valid),
                save_best_model_to_prefix='best_model')
else:
    print('Starting hyperparameter search...')
    from dragonn.hyperparameter_search import HyperparameterSearcher, RandomSearch
    fixed_hyperparameters = {'seq_length': seq_length, 'use_RNN': use_RNN,
                             'num_epochs': num_epochs}
    grid = {'num_filters': ((5, 100),), 'pool_width': (5, 40),
            'conv_width': ((6, 20),), 'dropout': (0, 0.5)}
    if use_deep_CNN:
        grid.update({'num_filters': ((5, 100), (5, 100), (5, 100)),
                     'conv_width': ((6, 20), (6, 20), (6, 20))})
    if use_RNN:
        grid.update({'GRU_size': (10, 50), 'TDD_size': (20, 60)})
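    # Hedged sketch of how the search above would presumably be completed; the
    # exact HyperparameterSearcher call signature and the search() method are
    # assumptions, not taken from this file.
    searcher = HyperparameterSearcher(SequenceDNN, fixed_hyperparameters, grid,
                                      X_train, y_train,
                                      validation_data=(X_valid, y_valid),
                                      backend=RandomSearch)
    searcher.search(num_hyperparameter_trials=50)
    model = searcher.best_model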
random_order = np.arange(len(X_train))
np.random.shuffle(random_order)
X_train = X_train[random_order]
y_train = y_train[random_order]
# Build model, train and test
if not do_hyperparameter_search:
    hyperparameters = {'seq_length': seq_length, 'use_deep_CNN': use_deep_CNN,
                       'use_RNN': use_RNN, 'num_filters': 45, 'pool_width': 25,
                       'conv_width': 10, 'L1': 0, 'dropout': 0.2}
    if use_deep_CNN:
        hyperparameters.update({'num_filters_2': 50, 'conv_width_2': 8,
                                'num_filters_3': 50, 'conv_width_3': 5})
    if use_RNN:
        hyperparameters.update({'GRU_size': 35, 'TDD_size': 45})
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    print('Test results: {}'.format(model.test(X_test, y_test)))
else:
    print('Starting hyperparameter search...')
    from hyperparameter_search import HyperparameterSearcher
    fixed_hyperparameters = {'seq_length': seq_length, 'use_deep_CNN': use_deep_CNN,
                             'use_RNN': use_RNN}
    grid = {'num_filters': (5, 100), 'pool_width': (5, 40),
            'conv_width': (6, 20), 'dropout': (0, 0.5)}
    if use_deep_CNN:
        grid.update({'num_filters_2': (5, 100), 'conv_width_2': (6, 20),
                     'num_filters_3': (5, 100), 'conv_width_3': (6, 20)})
    if use_RNN:
        grid.update({'GRU_size': (10, 50), 'TDD_size': (20, 60)})