def test_GetResults():
    """Regression-test ld.get_onehot against pickled reference fixtures.

    Loads a saved input batch and its expected one-hot encoding from
    ``data_testing/``, re-runs ``ld.get_onehot`` with the same settings
    (3 classes, sequence length 20), and asserts elementwise equality
    between each expected array and the freshly computed one.

    Raises:
        AssertionError: if any produced array differs from the fixture.
    """
    # Fix: the original used pickle.load(open(...)) which leaks the file
    # handles until GC; context managers close them deterministically.
    # NOTE(review): pickle fixtures are trusted local test data — do not
    # point these paths at untrusted input.
    with open("data_testing/correctOneHotIn.pickle", "rb") as infile:
        testInput = pickle.load(infile)
    with open("data_testing/correctOneHotOut.pickle", "rb") as infile:
        testCorrectOut = pickle.load(infile)

    getOneHotOut = ld.get_onehot(testInput, None, num_classes=3, seq_len=20)

    # zip stops at the shorter sequence; fixtures are assumed to be the
    # same length as get_onehot's output — TODO confirm.
    for expected, produced in zip(testCorrectOut, getOneHotOut):
        assert np.equal(expected, produced).all()
pair_dict[y] = [single_dict[y], x] else: single_dict[y] = x if len(pair_dict) == num_classes: break chosen_data = [] for i in range(2): for y in pair_dict: x = pair_dict[y][i] # print len(x) chosen_data.append((x, y)) x, y, m = get_onehot(chosen_data, None, is_dna_data=is_dna_data, seq_len=seq_len, mask_len=mask_len if mask else None) embed = embed_model.predict([x, m] if mask else x) pos_counts = dict() correct_counts = dict() for n in top_n: pos_counts[n] = [] correct_counts[n] = 0.0 for _ in range(n): pos_counts[n].append(0) for i in range(num_classes): distances = dict() ex = embed[i + num_classes]
# Script fragment: load a trained DNA model, compute its "AV" (activation
# vector) layer outputs for a test set, and read back previously saved
# per-class mean activations and MAV distances from the results directory.
# NOTE(review): this chunk is truncated — the loop that consumes the last
# csv.reader is not visible here.
model_name = 'blstm_dna_conv3_4500'
data_file = '/mnt/data/computervision/dna_train80_val10_test10/test.csv'
#data_file = '/mnt/data/computervision/dna_train80_val10_test10/unknowns.csv'
# divide=4: presumably subsamples the CSV to 1/4 of its rows — verify
# against load_csv's definition.
data_divide = 4
dist_min = 0
dist_max = 20
model_file = '../models/' + model_name + '.h5'
model = load_model(model_file)
# Sub-model that exposes the "AV" layer's output instead of the final
# softmax, so we can inspect pre-classification activations.
av_model = Model(inputs=model.input, outputs=model.get_layer("AV").output)
print av_model.summary()
data = load_csv(data_file, divide=data_divide)
print len(data)
# seq_len 4500 for DNA, 1500 otherwise (amino-acid sequences elsewhere in
# this codebase use length 1500).
x, y = get_onehot(data, None, is_dna_data=is_dna_data, seq_len=4500 if is_dna_data else 1500)
avs = av_model.predict(x, batch_size=500)
print 'done getting avs'
# Free the large one-hot tensors before loading more data.
del data, x, y
# Per-class mean activation vectors, one CSV row per class.
means = []
with open('../results/' + model_name + '_mean_activations.csv', 'r') as infile:
    r = csv.reader(infile)
    for row in r:
        means.append(np.array(row, dtype=np.float32))
# Distances of training activations from each class MAV; the reading loop
# for this file lies beyond the visible chunk.
dists = []
with open('../results/' + model_name + '_mav_distances.csv', 'r') as infile:
    r = csv.reader(infile)
# Script fragment: rebuild a model from a template, load trained weights,
# evaluate it on the test split, and optionally dump confusion-matrix /
# length statistics through a Logger.
# 4 letters for DNA (ACGT), 26 for amino-acid alphabet.
num_letters = 4 if is_dna_data else 26
model = model_template(num_classes, num_letters, sequence_length, embed_size=256, mask_length=mask_len if mask else None)
model.load_weights(model_file)
model.summary()
test_data = load_csv(data_dir + '/test.csv', divide=2 if is_dna_data else 1)
print len(test_data)
# Count how many sequences exceed the model's input length and will be
# cropped by get_onehot. Float so the division below is non-integer
# (Python 2 semantics).
crop_count = 0.0
for seq, y in test_data:
    if len(seq) > sequence_length:
        crop_count += 1
# NOTE(review): despite the label this prints a fraction (0..1), not a
# percentage.
print "percent cropped: ", crop_count / len(test_data)
test_x, test_y, test_m = get_onehot(test_data, None, is_dna_data=is_dna_data, seq_len=sequence_length, num_classes=num_classes, rand_start=random_crop, mask_len=mask_len if mask else None)
if print_acc:
    # evaluate() returns [loss, accuracy] given the compile metrics used
    # elsewhere in this codebase — TODO confirm compile settings.
    print "test accuracy: ", model.evaluate([test_x, test_m] if mask else test_x, test_y, batch_size=100)
if save_stats:
    # argmax over the softmax output gives the predicted class index.
    pred = model.predict([test_x, test_m] if mask else test_x, batch_size=100).argmax(axis=-1)
    log = Logger(model_name, num_classes, sequence_length)
    log.confusion_matrix(test_data, pred)
    log.length_stats(test_data, pred)
    log.length_histograms(test_data, pred)
    log.save()
# Script fragment: measure model accuracy as test sequences are mutated by
# increasing amounts (2%..20% in steps of 2), under three mutation modes,
# and write one CSV row per percentage to <model_name>_mutation_graphs.csv.
results = []
for percent in range(2, 22, 2):
    #mode 0: substitute, mode 1: 3-aligned cut, mode 2: unaligned cut
    row = [percent]
    for mode in range(3):
        # Reload a fresh copy each time so mutations don't accumulate
        # across modes/percentages.
        test_data = load_csv(data_dir + '/test.csv', divide=2)
        print len(test_data)
        for i in range(len(test_data)):
            (x, y) = test_data[i]
            if mode == 0:
                test_data[i] = (substitute(x, percent), y)
            else:
                # mode == 1 requests a codon-aligned (multiple-of-3) cut —
                # presumably handled inside delete_segment; verify there.
                test_data[i] = (delete_segment(x, percent, mode == 1), y)
            #if i % 100000 == 99999:
            #    print i+1
        test_x, test_y, test_m = get_onehot(test_data, None, is_dna_data=True, seq_len=sequence_length, num_classes=num_classes, mask_len=mask_len)
        # evaluate()[1] is the accuracy metric (index 0 is loss).
        acc = model.evaluate([test_x, test_m], test_y, batch_size=100, verbose=1)[1]
        print percent, mode, acc
        row.append(acc)
        # Release the large tensors before the next iteration.
        del test_data, test_x, test_y, test_m
    results.append(row)
with open('../results/' + model_name + '_mutation_graphs.csv', 'w') as outfile:
    w = csv.writer(outfile)
    for row in results:
        w.writerow(row)
# Script fragment: embed each class's sequences with embed_model, caching
# the embeddings as per-class .npy files so reruns can skip the forward
# pass. NOTE(review): the chunk starts mid-loop — the `i += 1` below
# belongs to a counting loop whose header is outside this view.
i += 1
print i
N = len(sequence_dict)
print 'done loading', N
for i in range(N):
    print len(sequence_dict[i])
    # Cache file named after the class label (rev_label_dict maps class
    # index -> label string — TODO confirm).
    filename = '/mnt/data/computervision/tara/embed64/' + rev_label_dict[i] + '.npy'
    if os.path.exists(filename):
        # Cache hit: reuse previously computed embeddings.
        embed_dict[i] = np.load(filename)
    else:
        x, y, m = get_onehot(sequence_dict[i], None, is_dna_data=is_dna_data, seq_len=seq_len, mask_len=mask_len)
        embed = embed_model.predict([x, m], batch_size=100, verbose=1)
        embed_dict[i] = embed
        # Drop the one-hot tensors before writing the cache file.
        del x, y, m
        np.save(filename, embed)
    # Free the raw sequences for this class; only embeddings are needed
    # from here on.
    del sequence_dict[i]
    print 'embedded', i, rev_label_dict[i]
    #embed_dict[i] = embed_dict[i][0:1000]
# All embeddings computed — release the models and remaining data.
del sequence_dict, model, embed_model
result = []
tree_dict = dict()
#read the first two columns of the input csv file into a list of tuples. #the file's second-column items become the first items in the tuples. #the list of tuples is called train_data train_data = load_csv(data_dir + '/train.csv') print(len(train_data)) #val_data = load_csv(data_dir + '/validation.csv', divide=2 if is_dna_data else 1) #val_x, val_y = get_onehot(val_data, None, num_classes=num_classes, seq_len=sequence_length, is_dna_data=is_dna_data) #print(len(val_data)) num_episodes = 50000#200000 # Each iteration currently takes about 6 secs, so cutting num_episodes # way down to be able to get to end of process in reasonable time. num_episodes = 5 for i in range(num_episodes): x, y, m = get_onehot(train_data, 100, num_classes=num_classes, seq_len=sequence_length, is_dna_data=is_dna_data, mask_len=mask_len if mask else None) print(i) print(model.train_on_batch([x,m] if mask else x, y)) if (i % 10000 == 0) or i == num_episodes - 1: #[loss, acc] = model.evaluate(val_x, val_y, batch_size=100) #print loss, acc #logger.record_val_acc(i, acc) model.save(save_path) print('saved to ' + save_path) del train_data #pred = model.predict(val_x, batch_size=100).argmax(axis=-1) #logger.confusion_matrix(val_data, pred) #logger.length_plot(val_data, pred)
# Script fragment: run the trained model's "AV" (activation vector) layer
# over the whole training set in large batches, collecting activations and
# labels for later per-class mean computation.
# NOTE(review): the chunk is truncated — the final `for` loop's body is
# outside this view.
model = load_model(model_file)
# Sub-model exposing the "AV" layer output instead of the final prediction.
av_model = Model(inputs=model.input, outputs=model.get_layer("AV").output)
print av_model.summary()
train_data = load_csv(data_dir + '/train.csv')
# One-hot encode 10000 rows at a time to bound peak memory.
batch_size = 10000
avs = []      # list of per-chunk activation arrays
actual = []   # list of per-chunk label arrays, parallel to avs
lower = 0
while lower < len(train_data):
    print lower
    upper = min(lower + batch_size, len(train_data))
    x, y = get_onehot(train_data[lower:upper], None, is_dna_data=is_dna_data, seq_len=seq_len)
    pred = av_model.predict(x, batch_size=500)
    avs.append(pred)
    actual.append(y)
    lower += batch_size
del train_data
# Accumulators for per-class activation sums and sample counts.
sums = np.zeros((num_classes, num_classes), np.float32)
counts = np.zeros((num_classes), np.float32)
class_avs = []
for i in range(num_classes):
    class_avs.append([])
for i in range(len(avs)):
# Script fragment: build and train a small LSTM classifier ("lstm50") on
# one-hot amino-acid sequences, with periodic validation and checkpointing.
# 26-letter amino-acid alphabet; inputs are (1500, 26) one-hot matrices.
num_amino_acids = 26
model = Sequential()
# Masking skips zero-padded timesteps so variable-length sequences don't
# pollute the LSTM state.
model.add(Masking(mask_value=0, input_shape=(1500, num_amino_acids)))
model.add(LSTM(50, activation='tanh'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer=Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
data_dir = '/mnt/data/computervision/train80_val10_test10'
train_data = load_csv(data_dir + '/train.csv')
print len(train_data)
val_data = load_csv(data_dir + '/validation.csv')
# Encode the whole validation set once up front (None = no batch sampling
# — presumably encodes every row; verify against get_onehot).
val_x, val_y = get_onehot(val_data, None)
print len(val_data)
logger = Logger('lstm50')
save_path = '../models/lstm50.h5'
num_episodes = 20000
for i in range(num_episodes):
    # Fresh random batch of 1000 samples per episode.
    x, y = get_onehot(train_data, 1000)
    print i
    # Prints the batch [loss, accuracy] for progress monitoring.
    print model.train_on_batch(x, y)
    # Validate every 1000 episodes and on the final one.
    if (i % 1000 == 0) or i == num_episodes - 1:
        [loss, acc] = model.evaluate(val_x, val_y, batch_size=1000)
        print loss, acc