def test(data, batch_size, model_file):
    model = load_model(model_file)
    print(np.round(model.layers[1].get_weights()[0], 3))
    generator = DataGenerator(batch_size)
    generator.fit(data, data)
    steps = int(math.ceil(data.shape[0] / batch_size))
    loss = model.evaluate_generator(generator, steps=steps)
    print('Test loss %f' % (loss, ))
    preds = model.predict_generator(generator, steps=steps)
    # preds = preds.reshape(data.shape[0], MAXLEN, 21)
    # preds = np.argmax(preds, axis=2)
    real = data.toarray()  # .reshape(data.shape[0], MAXLEN)
    # real = np.argmax(real, axis=2)
    preds = np.round(preds * 20).astype(np.int32)
    real = np.round(real * 20).astype(np.int32)
    for i in range(10):
        c = 0
        l = 0
        lp = 0
        print(preds[i])
        print(real[i])
        for j in range(len(real[i])):
            if real[i, j] != 0 and real[i, j] == preds[i, j]:
                c += 1
            elif l == 0 and real[i, j] == 0:
                l = j
            if lp == 0 and preds[i, j] == 0:
                lp = j
        print('Match %d, Length %d, Length pred %d' % (c, l, lp))
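# The functions in this module all rely on a DataGenerator helper that is not
# shown here. The sketch below is an assumption, not the original class: a
# minimal Keras-style infinite batch generator that supports both the
# DataGenerator(batch_size) and DataGenerator(batch_size, nb_classes)
# constructors and the fit(inputs[, targets]) call used throughout. Handling
# for multi-input data (lists of arrays) is omitted for brevity.
import scipy.sparse as sp


class DataGenerator(object):

    def __init__(self, batch_size, nb_classes=None):
        self.batch_size = batch_size
        self.nb_classes = nb_classes  # kept for API compatibility; unused here

    def fit(self, inputs, targets=None):
        # Store the data and reset the batch cursor.
        self.inputs = inputs
        self.targets = targets
        self.size = inputs.shape[0]
        self.start = 0

    def __iter__(self):
        return self

    def __next__(self):
        # Wrap around so Keras can draw batches indefinitely.
        if self.start >= self.size:
            self.start = 0
        start, end = self.start, min(self.start + self.batch_size, self.size)
        self.start = end
        batch = self.inputs[start:end]
        if sp.issparse(batch):
            batch = batch.toarray()  # densify sparse rows one batch at a time
        if self.targets is None:
            return batch
        labels = self.targets[start:end]
        if sp.issparse(labels):
            labels = labels.toarray()
        return (batch, labels)

    next = __next__  # Python 2 compatibility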
def model(data, hidden_layers, hidden_neurons, output_file, validation_split=0.9):
    # NOTE: validation_split is the *training* fraction here: the first 90%
    # of the rows train the autoencoder, the remainder validate it.
    train_n = int(validation_split * len(data))
    batch_size = 50
    train_data = data[:train_n, :]
    val_data = data[train_n:, :]
    # data: data_num * data_dim
    input_sh = Input(shape=(data.shape[1], ))
    encoded = Dense(data.shape[1], activation='relu',
                    activity_regularizer=regularizers.activity_l1l2(
                        10e-5, 10e-5))(input_sh)
    encoded = noise.GaussianNoise(0.2)(encoded)
    for i in range(hidden_layers):
        encoded = Dense(hidden_neurons[i], activation='relu',
                        activity_regularizer=regularizers.activity_l1l2(
                            10e-5, 10e-5))(encoded)
        encoded = noise.GaussianNoise(0.2)(encoded)
    # Mirror the encoder: chain the decoder layers in reverse order.
    decoded = encoded
    for j in range(hidden_layers - 1, -1, -1):
        decoded = Dense(hidden_neurons[j], activation='relu',
                        activity_regularizer=regularizers.activity_l1l2(
                            10e-5, 10e-5))(decoded)
    decoded = Dense(data.shape[1], activation='sigmoid')(decoded)
    autoencoder = Model(input=input_sh, output=decoded)
    autoencoder.compile(optimizer='adadelta', loss='mse')
    checkpointer = ModelCheckpoint(filepath='data/bestmodel' + output_file + '.hdf5',
                                   verbose=1, save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=15, verbose=1)
    train_generator = DataGenerator(batch_size)
    train_generator.fit(train_data, train_data)
    val_generator = DataGenerator(batch_size)
    val_generator.fit(val_data, val_data)
    autoencoder.fit_generator(train_generator,
                              samples_per_epoch=len(train_data),
                              nb_epoch=100,
                              validation_data=val_generator,
                              nb_val_samples=len(val_data),
                              max_q_size=batch_size,
                              callbacks=[checkpointer, earlystopper])
    enco = Model(input=input_sh, output=encoded)
    enco.compile(optimizer='adadelta', loss='mse')
    reprsn = enco.predict(data)
    return reprsn
def model(params, test_df, batch_size=b_size, nb_epoch=n_epoch, is_train=True):
    # set parameters:
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    test, test_df = load_data(test_df)
    test_gos = test_df['gos'].values
    test_data, test_labels = test
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Test data size: %d" % len(test_data[0]))
    model_path = (DATA_ROOT + 'models/model_' + FUNCTION + '.h5')
    checkpointer = ModelCheckpoint(filepath=model_path, verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    logging.info('Starting training the model')
    batch_size_new = 2
    test_generator = DataGenerator(batch_size_new, nb_classes)
    test_generator.fit(test_data, test_labels)
    logging.info('Loading best model')
    pred = {}
    start_time = time.time()
    with graph.as_default():
        model = load_model(model_path)
    logging.info('Loading time: %d' % (time.time() - start_time))
    start_time = time.time()
    preds = model.predict_generator(test_generator,
                                    val_samples=len(test_data[0]))
    running_time = time.time() - start_time
    logging.info('Running time: %d %d' % (running_time, len(test_data[0])))
    logging.info('Computing performance')
    f, p, r, t, preds_max = compute_performance(preds, test_labels, test_gos)
    roc_auc = compute_roc(preds, test_labels)
    mcc = compute_mcc(preds_max, test_labels)
    logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('MCC: \t %f ' % (mcc, ))
    print('f :%.3f & p: %.3f & r: %.3f & roc_auc: %.3f & mcc: %.3f' %
          (f, p, r, roc_auc, mcc))
    proteins = test_df['proteins']
    predictions = list()
    for i in xrange(preds_max.shape[0]):
        predictions.append(preds_max[i])
    # Count positive labels and positive predictions for the first example
    # (currently unused).
    counter2 = 0
    for ele in test_labels[0]:
        if ele == 1:
            counter2 = counter2 + 1
    counter = 0
    for ele in predictions[0]:
        if ele == 1:
            counter = counter + 1
    prediction_list = find_the_predicted_go_term(predictions, functions)
    return prediction_list
def model(model_name):
    # set parameters:
    batch_size = 128
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    data, targets = load_data()
    data_generator = DataGenerator(batch_size, nb_classes)
    data_generator.fit(data, None)
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Data size: %d" % len(data[0]))
    logging.info('Loading the model')
    with open(DATA_ROOT + model_name + '_' + FUNCTION + '.json', 'r') as f:
        json_string = next(f)
    model = model_from_json(json_string)
    optimizer = RMSprop()
    model.compile(optimizer=optimizer, loss='binary_crossentropy')
    model_path = DATA_ROOT + model_name + '_weights_' + FUNCTION + '.pkl'
    logging.info('Compilation finished in %d sec' % (time.time() - start_time))
    logging.info('Loading weights')
    load_model_weights(model, model_path)
    logging.info('Predicting')
    preds = model.predict_generator(data_generator,
                                    val_samples=len(data[0]),
                                    nb_worker=12)
    for i in xrange(len(preds)):
        preds[i] = preds[i].reshape(-1, 1)
    preds = np.concatenate(preds, axis=1)
    # Propagate scores up the GO hierarchy: an ancestor term must score at
    # least as high as any of its descendants.
    incon = 0
    for i in xrange(len(data)):
        for j in xrange(len(functions)):
            anchestors = get_anchestors(go, functions[j])
            for p_id in anchestors:
                if (p_id not in [GO_ID, functions[j]] and
                        preds[i, go_indexes[p_id]] < preds[i, j]):
                    incon += 1
                    preds[i, go_indexes[p_id]] = preds[i, j]
    logging.info('Inconsistent predictions: %d' % incon)
    predictions = list()
    for i in xrange(len(targets)):
        predictions.append(preds[i])
    df = pd.DataFrame({'targets': targets, 'predictions': predictions})
    print(len(df))
    df.to_pickle(DATA_ROOT + model_name + '_preds_' + FUNCTION + '.pkl')
    logging.info('Done in %d sec' % (time.time() - start_time))
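# Hedged sketch of the load_model_weights helper used above; the '.pkl'
# extension suggests the weights are pickled, so this assumes a pickled list
# of per-layer weight arrays in the order returned by model.get_weights().
# The original implementation is not shown in this module.
import pickle


def load_model_weights(model, filepath):
    with open(filepath, 'rb') as f:
        weights = pickle.load(f)
    model.set_weights(weights)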
def run_model(model_path='data/model.h5', batch_size=128):
    model = load_model(model_path)
    prots, data = load_data()
    data_generator = DataGenerator(batch_size)
    data_generator.fit(data)
    # Features layer model
    model = model.layers[1]
    steps = math.ceil(data.shape[0] / batch_size)
    output = model.predict_generator(data_generator, steps=steps, verbose=1)
    print(output)
    vectors = list()
    for i in range(output.shape[0]):
        vectors.append(output[i, :])
    df = pd.DataFrame({'proteins': prots, 'vectors': vectors})
    df.to_pickle('data/vectors.pkl')
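# Hedged usage sketch: run_model is assumed to be invoked as a script entry
# point; the model path and batch size below are just its defaults.
if __name__ == '__main__':
    run_model(model_path='data/model.h5', batch_size=128)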
def model(model_name):
    # set parameters:
    batch_size = 128
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    data, targets = load_data()
    data_generator = DataGenerator(batch_size, nb_classes)
    data_generator.fit(data, None)
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Data size: %d" % len(data[0]))
    logging.info('Loading the model')
    model = load_model(DATA_ROOT + model_name + '_' + FUNCTION + '.h5')
    logging.info('Predicting')
    preds = model.predict_generator(data_generator, val_samples=len(data[0]))
    # incon = 0
    # for i in xrange(len(data)):
    #     for j in xrange(len(functions)):
    #         anchestors = get_anchestors(go, functions[j])
    #         for p_id in anchestors:
    #             if (p_id not in [GO_ID, functions[j]] and
    #                     preds[i, go_indexes[p_id]] < preds[i, j]):
    #                 incon += 1
    #                 preds[i, go_indexes[p_id]] = preds[i, j]
    # logging.info('Inconsistent predictions: %d' % incon)
    predictions = list()
    for i in xrange(len(targets)):
        predictions.append(preds[i])
    df = pd.DataFrame({'targets': targets, 'predictions': predictions})
    print(len(df))
    df.to_pickle(DATA_ROOT + model_name + '_preds_' + FUNCTION + '.pkl')
    logging.info('Done in %d sec' % (time.time() - start_time))
def train(data, batch_size, epochs, model_file, validation_split=0.8):
    index = np.arange(data.shape[0])
    train_n = int(data.shape[0] * validation_split)
    train_data, valid_data = data[index[:train_n], :], data[index[train_n:], :]
    train_generator = DataGenerator(batch_size)
    train_generator.fit(train_data, train_data)
    valid_generator = DataGenerator(batch_size)
    valid_generator.fit(valid_data, valid_data)
    steps = int(math.ceil(train_n / batch_size))
    valid_n = data.shape[0] - train_n
    valid_steps = int(math.ceil(valid_n / batch_size))
    checkpointer = ModelCheckpoint(filepath=model_file, verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=100, verbose=1)
    model = build_model()
    model.fit_generator(train_generator,
                        steps_per_epoch=steps,
                        epochs=epochs,
                        validation_data=valid_generator,
                        validation_steps=valid_steps,
                        callbacks=[earlystopper, checkpointer])
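# Hedged sketch of the build_model helper assumed by train(): a small dense
# autoencoder compiled with an mse loss, which matches the single scalar loss
# printed by test() above. The layer sizes and the MAXLEN input width are
# illustrative assumptions, not the original architecture.
from keras.layers import Dense, Input
from keras.models import Model


def build_model(input_dim=MAXLEN, latent_dim=128):
    inputs = Input(shape=(input_dim, ))
    encoded = Dense(latent_dim, activation='relu')(inputs)
    decoded = Dense(input_dim, activation='sigmoid')(encoded)
    autoencoder = Model(inputs, decoded)
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder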
def model(batch_size=128, nb_epoch=100, is_train=True):
    # set parameters:
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    train, val, test, train_df, valid_df, test_df = load_data()
    train_df = pd.concat([train_df, valid_df])
    test_gos = test_df['gos'].values
    train_data, train_labels = train
    val_data, val_labels = val
    test_data, test_labels = test
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data))
    logging.info("Validation data size: %d" % len(val_data))
    logging.info("Test data size: %d" % len(test_data))
    model_path = DATA_ROOT + 'models/model_seq_' + FUNCTION + '.h5'
    checkpointer = ModelCheckpoint(filepath=model_path, verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    logging.info('Starting training the model')
    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)
    if is_train:
        model = get_model()
        model.fit_generator(train_generator,
                            samples_per_epoch=len(train_data),
                            nb_epoch=nb_epoch,
                            validation_data=valid_generator,
                            nb_val_samples=len(val_data),
                            max_q_size=batch_size,
                            callbacks=[checkpointer, earlystopper])
    logging.info('Loading best model')
    model = load_model(model_path)
    # Extract the feature sub-model and return early; everything below this
    # point is currently unreachable.
    model = model.layers[1]
    output = model.predict_generator(test_generator,
                                     val_samples=len(test_data))
    print(output.shape)
    return
    logging.info('Predicting')
    preds = model.predict_generator(test_generator,
                                    val_samples=len(test_data))
    # incon = 0
    # for i in range(len(test_data)):
    #     for j in range(len(functions)):
    #         childs = set(go[functions[j]]['children']).intersection(func_set)
    #         ok = True
    #         for n_id in childs:
    #             if preds[i, j] < preds[i, go_indexes[n_id]]:
    #                 preds[i, j] = preds[i, go_indexes[n_id]]
    #                 ok = False
    #         if not ok:
    #             incon += 1
    logging.info('Computing performance')
    f, p, r, t, preds_max = compute_performance(preds, test_labels, test_gos)
    roc_auc = compute_roc(preds, test_labels)
    mcc = compute_mcc(preds_max, test_labels)
    logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('MCC: \t %f ' % (mcc, ))
    print('%.3f & %.3f & %.3f & %.3f & %.3f' % (f, p, r, roc_auc, mcc))
    # logging.info('Inconsistent predictions: %d' % incon)
    # logging.info('Saving the predictions')
    # proteins = test_df['proteins']
    # predictions = list()
    # for i in range(preds_max.shape[0]):
    #     predictions.append(preds_max[i])
    # df = pd.DataFrame(
    #     {
    #         'proteins': proteins, 'predictions': predictions,
    #         'gos': test_df['gos'], 'labels': test_df['labels']})
    # df.to_pickle(DATA_ROOT + 'test-' + FUNCTION + '-predictions.pkl')
    # logging.info('Done in %d sec' % (time.time() - start_time))
    function_centric_performance(functions, preds.T, test_labels.T)
def model(batch_size=128, nb_epoch=100):
    # set parameters:
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    train, val, test, train_df, valid_df, test_df = load_data()
    train_df = pd.concat([train_df, valid_df])
    test_gos = test_df['gos'].values
    train_data, train_labels = train
    val_data, val_labels = val
    test_data, test_labels = test
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data[0]))
    logging.info("Validation data size: %d" % len(val_data[0]))
    logging.info("Test data size: %d" % len(test_data[0]))
    # pre_model_path = DATA_ROOT + 'pre_model_weights_' + FUNCTION + '.pkl'
    model_path = DATA_ROOT + 'model_' + FUNCTION + '.h5'
    checkpointer = ModelCheckpoint(filepath=model_path, verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    logging.info('Starting training the model')
    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)
    # model = get_model()
    # model.fit_generator(
    #     train_generator,
    #     samples_per_epoch=len(train_data[0]),
    #     nb_epoch=nb_epoch,
    #     validation_data=valid_generator,
    #     nb_val_samples=len(val_data[0]),
    #     max_q_size=batch_size,
    #     callbacks=[checkpointer, earlystopper])
    logging.info('Loading best model')
    model = load_model(model_path)
    logging.info('Predicting')
    preds = model.predict_generator(test_generator,
                                    val_samples=len(test_data[0]))
    # incon = 0
    # for i in range(len(test_data)):
    #     for j in range(len(functions)):
    #         childs = set(go[functions[j]]['children']).intersection(func_set)
    #         ok = True
    #         for n_id in childs:
    #             if preds[i, j] < preds[i, go_indexes[n_id]]:
    #                 preds[i, j] = preds[i, go_indexes[n_id]]
    #                 ok = False
    #         if not ok:
    #             incon += 1
    logging.info('Computing performance')
    f, p, r, t, preds_max = compute_performance(preds, test_labels, test_gos)
    roc_auc = compute_roc(preds, test_labels)
    logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
def model():
    # set parameters:
    batch_size = 128
    nb_epoch = 100
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    train, val, test, train_df, valid_df, test_df = load_data()
    train_df = pd.concat([train_df, valid_df])
    test_gos = test_df['gos'].values
    train_data, train_labels = train
    val_data, val_labels = val
    test_data, test_labels = test
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data[0]))
    logging.info("Validation data size: %d" % len(val_data[0]))
    logging.info("Test data size: %d" % len(test_data[0]))
    pre_model_path = DATA_ROOT + 'pre_model_weights_' + FUNCTION + '.pkl'
    model_path = DATA_ROOT + 'model_weights_' + FUNCTION + '.pkl'
    last_model_path = DATA_ROOT + 'model_weights_' + FUNCTION + '.last.pkl'
    checkpointer = MyCheckpoint(filepath=model_path, verbose=1,
                                save_best_only=True, save_weights_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    model = get_model()
    # logging.info('Loading pretrained weights')
    # load_model_weights(model, pre_model_path)
    logging.info('Starting training the model')
    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)
    # model.fit_generator(
    #     train_generator,
    #     samples_per_epoch=len(train_data[0]),
    #     nb_epoch=nb_epoch,
    #     validation_data=valid_generator,
    #     nb_val_samples=len(val_data[0]),
    #     max_q_size=batch_size,
    #     callbacks=[checkpointer, earlystopper])
    logging.info('Loading weights')
    load_model_weights(model, model_path)
    model.save(DATA_ROOT + 'model_%s.h5' % FUNCTION)
    logging.info('Predicting')
    preds = model.predict_generator(test_generator,
                                    val_samples=len(test_data[0]))
    # incon = 0
    # for i in xrange(len(test_data)):
    #     for j in xrange(len(functions)):
    #         childs = set(go[functions[j]]['children']).intersection(func_set)
    #         ok = True
    #         for n_id in childs:
    #             if preds[i, j] < preds[i, go_indexes[n_id]]:
    #                 preds[i, j] = preds[i, go_indexes[n_id]]
    #                 ok = False
    #         if not ok:
    #             incon += 1
    logging.info('Computing performance')
    f, p, r, preds_max = compute_performance(preds, test_labels, test_gos)
    # roc_auc = compute_roc(preds, test_labels)
    # logging.info('Fmax measure: \t %f %f %f' % (f, p, r))
    # logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    # logging.info('Inconsistent predictions: %d' % incon)
    logging.info('Saving the predictions')
    proteins = test_df['proteins']
    predictions = list()
    for i in xrange(preds_max.shape[0]):
        predictions.append(preds_max[i])
    df = pd.DataFrame({
        'proteins': proteins,
        'predictions': predictions,
        'gos': test_df['gos'],
        'labels': test_df['labels']
    })
    df.to_pickle(DATA_ROOT + 'test-' + FUNCTION + '-predictions.pkl')
    logging.info('Done in %d sec' % (time.time() - start_time))
def model(params, batch_size=128, nb_epoch=6, is_train=True):
    # set parameters:
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    train, val, test, train_df, valid_df, test_df = load_data()
    train_df = pd.concat([train_df, valid_df])
    test_gos = test_df['gos'].values
    train_data, train_labels = train
    val_data, val_labels = val
    test_data, test_labels = test
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data[0]))
    logging.info("Validation data size: %d" % len(val_data[0]))
    logging.info("Test data size: %d" % len(test_data[0]))
    model_path = (DATA_ROOT + 'models/model_' + FUNCTION + '.h5')
    # '-' + str(params['embedding_dims']) +
    # '-' + str(params['nb_filter']) +
    # '-' + str(params['nb_conv']) +
    # '-' + str(params['nb_dense']) + '.h5')
    checkpointer = ModelCheckpoint(filepath=model_path, verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    logging.info('Starting training the model')
    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)
    if is_train:
        model = get_model(params)
        model.fit_generator(train_generator,
                            samples_per_epoch=len(train_data[0]),
                            nb_epoch=nb_epoch,
                            validation_data=valid_generator,
                            nb_val_samples=len(val_data[0]),
                            max_q_size=batch_size,
                            callbacks=[checkpointer, earlystopper])
    logging.info('Loading best model')
    start_time = time.time()
    model = load_model(model_path)
    logging.info('Loading time: %d' % (time.time() - start_time))
    # orgs = ['9606', '10090', '10116', '7227', '7955',
    #         '559292', '3702', '284812', '6239',
    #         '83333', '83332', '224308', '208964']
    # for org in orgs:
    #     logging.info('Predicting for %s' % (org,))
    #     train, val, test, train_df, valid_df, test_df = load_data(org=org)
    #     test_data, test_labels = test
    #     test_gos = test_df['gos'].values
    #     test_generator = DataGenerator(batch_size, nb_classes)
    #     test_generator.fit(test_data, test_labels)
    start_time = time.time()
    preds = model.predict_generator(test_generator,
                                    val_samples=len(test_data[0]))
    running_time = time.time() - start_time
    logging.info('Running time: %d %d' % (running_time, len(test_data[0])))
    logging.info('Computing performance')
    f, p, r, t, preds_max = compute_performance(preds, test_labels, test_gos)
    roc_auc = compute_roc(preds, test_labels)
    mcc = compute_mcc(preds_max, test_labels)
    logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('MCC: \t %f ' % (mcc, ))
    print('%.3f & %.3f & %.3f & %.3f & %.3f' % (f, p, r, roc_auc, mcc))
    # return f
    # logging.info('Inconsistent predictions: %d' % incon)
    # logging.info('Saving the predictions')
    proteins = test_df['proteins']
    predictions = list()
    for i in range(preds_max.shape[0]):
        predictions.append(preds_max[i])
    df = pd.DataFrame({
        'proteins': proteins,
        'predictions': predictions,
        'gos': test_df['gos'],
        'labels': test_df['labels']
    })
    df.to_pickle(DATA_ROOT + 'test-' + FUNCTION + '-preds.pkl')
def model(params, batch_size=b_size, nb_epoch=n_epoch, is_train=True):
    # set parameters:
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    train, val, test, train_df, valid_df, test_df = load_data()
    print(len(test_df))
    train_df = pd.concat([train_df, valid_df])
    test_gos = test_df['gos'].values
    train_data, train_labels = train
    val_data, val_labels = val
    test_data, test_labels = test
    print(len(test_labels))
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data[0]))
    logging.info("Validation data size: %d" % len(val_data[0]))
    logging.info("Test data size: %d" % len(test_data[0]))
    model_path = (DATA_ROOT + 'models/model_' + FUNCTION + '.h5')
    checkpointer = ModelCheckpoint(filepath=model_path, verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    logging.info('Starting training the model')
    print(train_data)
    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)
    is_train = True  # NOTE: overrides the is_train argument, forcing training
    if is_train:
        model = get_model(params)
        model.fit_generator(train_generator,
                            samples_per_epoch=len(train_data[0]),
                            nb_epoch=nb_epoch,
                            validation_data=valid_generator,
                            nb_val_samples=len(val_data[0]),
                            max_q_size=batch_size,
                            callbacks=[checkpointer, earlystopper])
    logging.info('Loading best model')
    start_time = time.time()
    model = load_model(model_path)
    logging.info('Loading time: %d' % (time.time() - start_time))
    start_time = time.time()
    preds = model.predict_generator(test_generator,
                                    val_samples=len(test_data[0]))
    running_time = time.time() - start_time
    logging.info('Running time: %d %d' % (running_time, len(test_data[0])))
    logging.info('Computing performance')
    # pred_file = "pred" + FUNCTION + ".txt"
    # test_file = "test" + FUNCTION + ".txt"
    # gos_file = "test" + FUNCTION + "_goc.txt"
    # write_file(pred_file, preds)
    # write_file(test_file, test_labels)
    # write_file(gos_file, test_gos)
    f, p, r, t, preds_max = compute_performance(preds, test_labels, test_gos)
    roc_auc = compute_roc(preds, test_labels)
    mcc = compute_mcc(preds_max, test_labels)
    logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('MCC: \t %f ' % (mcc, ))
    print('f :%.3f & p: %.3f & r: %.3f & roc_auc: %.3f & mcc: %.3f' %
          (f, p, r, roc_auc, mcc))
    write_results([f, p, r, roc_auc, mcc])
    proteins = test_df['proteins']
    predictions = list()
    for i in xrange(preds_max.shape[0]):
        predictions.append(preds_max[i])
    df = pd.DataFrame({
        'proteins': proteins,
        'predictions': predictions,
        'gos': test_df['gos'],
        'labels': test_df['labels']
    })
    print(df)
    df.to_pickle('test' + FUNCTION + 'preds.pkl')
def train_model(batch_size=128, epochs=100, is_train=True,
                model_path='data/model.h5'):
    # set parameters:
    start_time = time.time()
    logging.info("Loading Data")
    train, valid, test = load_data()
    train_data, train_labels = train
    valid_data, valid_labels = valid
    test_data, test_labels = test
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % train_data.shape[0])
    logging.info("Validation data size: %d" % valid_data.shape[0])
    logging.info("Test data size: %d" % test_data.shape[0])
    checkpointer = ModelCheckpoint(filepath=model_path, verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    logging.info('Starting training the model')
    train_generator = DataGenerator(batch_size)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size)
    valid_generator.fit(valid_data, valid_labels)
    test_generator = DataGenerator(batch_size)
    test_generator.fit(test_data, test_labels)
    if is_train:
        valid_steps = int(math.ceil(valid_data.shape[0] / batch_size))
        train_steps = int(math.ceil(train_data.shape[0] / batch_size))
        model = get_model()
        model.fit_generator(train_generator,
                            steps_per_epoch=train_steps,
                            epochs=epochs,
                            validation_data=valid_generator,
                            validation_steps=valid_steps,
                            max_queue_size=batch_size,
                            workers=12,
                            callbacks=[checkpointer, earlystopper])
    logging.info('Loading best model')
    model = load_model(model_path)
    logging.info('Predicting')
    test_steps = int(math.ceil(test_data.shape[0] / batch_size))
    preds = model.predict_generator(test_generator, steps=test_steps,
                                    verbose=1)
    logging.info('Computing performance')
    test_labels = test_labels.toarray()
    f, p, r, t, preds_max = compute_performance(preds, test_labels)
    roc_auc = compute_roc(preds, test_labels)
    mcc = compute_mcc(preds_max, test_labels)
    logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('MCC: \t %f ' % (mcc, ))
    print('%.3f & %.3f & %.3f & %.3f & %.3f' % (f, p, r, roc_auc, mcc))
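# Hedged sketches of the metric helpers used throughout this module, assuming
# micro-averaged ROC AUC over the flattened score matrix and MCC over the
# thresholded predictions; the original implementations are not shown here.
from sklearn.metrics import auc, matthews_corrcoef, roc_curve


def compute_roc(preds, labels):
    # Micro-average: flatten (samples x classes) into one long vector.
    fpr, tpr, _ = roc_curve(labels.flatten(), preds.flatten())
    return auc(fpr, tpr)


def compute_mcc(preds_max, labels):
    return matthews_corrcoef(labels.flatten(), preds_max.flatten())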
def model():
    # set parameters:
    batch_size = 128
    nb_epoch = 100
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    train, val, test, test_df = load_data()
    test_gos = test_df['gos'].values
    train_data, train_labels = train
    val_data, val_labels = val
    test_data, test_labels = test
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data))
    logging.info("Validation data size: %d" % len(val_data))
    logging.info("Test data size: %d" % len(test_data))
    logging.info("Building the model")
    inputs = Input(shape=(MAXLEN, ), dtype='int32', name='input1')
    feature_model = get_feature_model()(inputs)
    layers = get_layers(feature_model)
    output_models = []
    for i in range(len(functions)):
        output_models.append(layers[functions[i]]['output'])
    net = merge(output_models, mode='concat', concat_axis=1)
    # net = Dense(nb_classes * 2, activation='relu')(feature_model)
    # net = Dense(nb_classes, activation='sigmoid')(net)
    # net = Activation('sigmoid')(net)
    model = Model(input=inputs, output=net)
    logging.info('Model built in %d sec' % (time.time() - start_time))
    logging.info('Saving the model')
    model_json = model.to_json()
    with open(DATA_ROOT + 'model_seq_' + FUNCTION + '.json', 'w') as f:
        f.write(model_json)
    logging.info('Compiling the model')
    optimizer = RMSprop()
    model.compile(optimizer=optimizer, loss='binary_crossentropy')
    pre_model_path = DATA_ROOT + 'pre_model_seq_weights_' + FUNCTION + '.pkl'
    model_path = DATA_ROOT + 'model_seq_weights_' + FUNCTION + '.pkl'
    checkpointer = MyCheckpoint(filepath=model_path, verbose=1,
                                save_best_only=True, save_weights_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    logging.info('Compilation finished in %d sec' % (time.time() - start_time))
    # logging.info('Loading pretrained weights')
    # load_model_weights(model, pre_model_path)
    logging.info('Starting training the model')
    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)
    # model.fit_generator(
    #     train_generator,
    #     samples_per_epoch=len(train_data),
    #     nb_epoch=nb_epoch,
    #     validation_data=valid_generator,
    #     nb_val_samples=len(val_data),
    #     max_q_size=batch_size,
    #     callbacks=[checkpointer, earlystopper])
    logging.info('Loading weights')
    load_model_weights(model, model_path)
    # model.save(DATA_ROOT + 'model_%s.h5' % FUNCTION)
    preds = model.predict_generator(test_generator,
                                    val_samples=len(test_data))
    logging.info(preds.shape)
    incon = 0
    # for i in xrange(len(test_data)):
    #     for j in xrange(len(functions)):
    #         childs = set(go[functions[j]]['children']).intersection(func_set)
    #         ok = True
    #         for n_id in childs:
    #             if preds[i, j] < preds[i, go_indexes[n_id]]:
    #                 preds[i, j] = preds[i, go_indexes[n_id]]
    #                 ok = False
    #         if not ok:
    #             incon += 1
    f, p, r, preds_max = compute_performance(preds, test_labels, test_gos)
    roc_auc = compute_roc(preds, test_labels)
    logging.info('Fmax measure: \t %f %f %f' % (f, p, r))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('Inconsistent predictions: %d' % incon)
    logging.info('Saving the predictions')
    proteins = test_df['proteins']
    predictions = list()
    for i in xrange(preds_max.shape[0]):
        predictions.append(preds_max[i])
    df = pd.DataFrame({
        'proteins': proteins,
        'predictions': predictions,
        'gos': test_df['gos'],
        'labels': test_df['labels']
    })
    df.to_pickle(DATA_ROOT + 'test-' + FUNCTION + '-preds-seq.pkl')
    logging.info('Done in %d sec' % (time.time() - start_time))
    function_centric_performance(functions, preds.T, test_labels.T)
def model(data, output_file, validation_split=0.9):
    # Load the network configuration: hidden layer sizes from 'neorons.txt'
    # and the number of hidden layers from 'layer.txt'.
    md = open('neorons.txt', 'r')
    line = md.readline()
    print(line)
    cnmd = open('layer.txt', 'r')
    l = cnmd.readline()
    cnm = arg_as_list(line)
    hidden_neurons = cnm
    hidden_layers = int(l)
    train_n = int(validation_split * len(data))
    batch_size = 50
    e = 50  # number of epochs
    m = data.shape[1]
    # hidden_neurons[0] selects the first-layer architecture:
    # 0 = graph convolution, 1 = 1D convolution, otherwise dense.
    if hidden_neurons[0] == 0:
        train_data = data
        val_data = data
        input_sh = Input(shape=(data.shape[1], ))
        encoded = noise.GaussianNoise(0.2)(input_sh)
    elif hidden_neurons[0] == 1:
        train_data = data[:train_n, :]
        val_data = data[train_n:, :]
        # train_data = np.expand_dims(train_data, axis=0)
        # val_data = np.expand_dims(val_data, axis=0)
        # train_data = tf.reshape(train_data, (train_n, data.shape[1], 1))
        input_sh = Input(shape=(data.shape[1], ))
        input_sh1 = Reshape((data.shape[1], 1))(input_sh)
        encoded = noise.GaussianNoise(0.2)(input_sh1)
    else:
        train_data = data[:train_n, :]
        val_data = data[train_n:, :]
        input_sh = Input(shape=(data.shape[1], ))
        encoded = noise.GaussianNoise(0.2)(input_sh)
    print('encoded ndim:', encoded.ndim)
    print(hidden_neurons)
    print('layers:', hidden_layers)
    if hidden_neurons[0] == 0:
        batch_size = len(data)
        encoded = GraphConvolution(
            input_dim=data.shape[1],
            output_dim=hidden_neurons[1],
            support=input_sh,
            act=tf.nn.relu,
        )(encoded)
    elif hidden_neurons[0] == 1:
        encoded = Conv1D(filters=1,
                         kernel_size=178 - hidden_neurons[1],
                         activation='relu')(encoded)
        encoded = Flatten()(encoded)
        print('encoded ndim after Conv1D:', encoded.ndim)
    elif hidden_neurons[0] == 2:
        encoded = Dense(hidden_neurons[1], activation='relu')(encoded)
        encoded = noise.GaussianNoise(0.2)(encoded)
    for i in range(2, hidden_layers):
        print(i, hidden_neurons[i])
        encoded = Dense(hidden_neurons[i], activation='relu')(encoded)
        encoded = noise.GaussianNoise(0.2)(encoded)
    # Decoder: mirror the encoder sizes in reverse order.
    decoded = Dense(hidden_neurons[-2], activation='relu')(encoded)
    print(hidden_neurons[-2])
    for j in range(hidden_layers - 3, 0, -1):
        print('decoder layer', j, hidden_neurons[j])
        decoded = Dense(hidden_neurons[j], activation='relu')(decoded)
    print(data.shape[1])
    decoded = Dense(data.shape[1], activation='sigmoid')(decoded)
    autoencoder = Model(inputs=input_sh, outputs=decoded)
    # Backpropagation trains the weights.
    autoencoder.compile(optimizer='adadelta', loss='mse')
    # checkpointer = ModelCheckpoint(filepath='bestmodel' + output_file + ".hdf5",
    #                                verbose=1, save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=15, verbose=1)
    if hidden_neurons[0] == 1:
        train_data1 = data[:train_n, :]
        val_data1 = data[train_n:, :]
        train_generator = DataGenerator(batch_size)
        print('train_data shape:', train_data.shape)
        print('train_data1 shape:', train_data1.shape)
        train_generator.fit(train_data, train_data)
        val_generator = DataGenerator(batch_size)
        val_generator.fit(val_data, val_data)
    else:
        train_generator = DataGenerator(batch_size)
        train_generator.fit(train_data, train_data)
        val_generator = DataGenerator(batch_size)
        val_generator.fit(val_data, val_data)
    h = autoencoder.fit_generator(
        train_generator,
        steps_per_epoch=len(data) // batch_size,
        epochs=e,
        validation_data=val_generator,
        validation_steps=len(data),
        callbacks=[earlystopper],
    )
    # Use the last recorded validation loss; early stopping may end training
    # before epoch e, so indexing the history with e - 1 could fail.
    avge = h.history['val_loss'][-1]
    print('avge', avge)
    enco = Model(inputs=input_sh, outputs=encoded)  # encoder part only
    enco.compile(optimizer='adadelta', loss='mse')
    reprsn = enco.predict(data, batch_size=batch_size)
    return reprsn, avge