Example #1
def test(data, batch_size, model_file):
    model = load_model(model_file)
    print(np.round(model.layers[1].get_weights()[0], 3))
    generator = DataGenerator(batch_size)
    generator.fit(data, data)
    steps = int(math.ceil(data.shape[0] / batch_size))
    loss = model.evaluate_generator(generator, steps=steps)
    print('Test loss %f' % (loss, ))

    preds = model.predict_generator(generator, steps=steps)
    # preds = preds.reshape(data.shape[0], MAXLEN, 21)
    # preds = np.argmax(preds, axis=2)
    real = data.toarray()  #.reshape(data.shape[0], MAXLEN)
    # real = np.argmax(real, axis=2)
    preds = np.round(preds * 20).astype(np.int32)
    real = np.round(real * 20).astype(np.int32)
    for i in range(10):
        match = 0
        real_len = 0
        pred_len = 0
        print(preds[i])
        print(real[i])
        for j in range(len(real[i])):
            # Count positions where the prediction matches a non-zero label.
            if real[i, j] != 0 and real[i, j] == preds[i, j]:
                match += 1
            # Record the first zero, i.e. the effective sequence length.
            elif real_len == 0 and real[i, j] == 0:
                real_len = j
            if pred_len == 0 and preds[i, j] == 0:
                pred_len = j
        print('Match %d, Length %d, Length pred %d' % (match, real_len, pred_len))
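Every example on this page calls into a DataGenerator helper whose definition is not included. The following is a minimal sketch of what it plausibly looks like, inferred from the call sites (a fit(inputs, targets) method, an optional nb_classes argument, and endless batch iteration as Keras' *_generator methods require). The class body is an assumption, not the original implementation, and it only handles the single-array case; some examples index test_data[0], which suggests multi-input lists the real class also supports.

import numpy as np
from scipy import sparse


class DataGenerator(object):
    """Hypothetical batch generator matching the call sites above."""

    def __init__(self, batch_size, nb_classes=None):
        self.batch_size = batch_size
        self.nb_classes = nb_classes

    def fit(self, inputs, targets=None):
        # Store the full arrays; batches are sliced out lazily.
        self.inputs = inputs
        self.targets = targets
        self.size = inputs.shape[0]
        self.start = 0

    def __iter__(self):
        return self

    def __next__(self):
        end = min(self.start + self.batch_size, self.size)
        x = self.inputs[self.start:end]
        if sparse.issparse(x):
            x = x.toarray()
        y = None
        if self.targets is not None:
            y = self.targets[self.start:end]
            if sparse.issparse(y):
                y = y.toarray()
        self.start = end if end < self.size else 0  # wrap around forever
        return x if y is None else (x, y)

    next = __next__  # Python 2 spelling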
Example #2
def model(data,
          hidden_layers,
          hidden_neurons,
          output_file,
          validation_split=0.9):

    train_n = int(validation_split * len(data))
    batch_size = 50
    train_data = data[:train_n, :]
    val_data = data[train_n:, :]
    # data: data_num x data_dim
    input_sh = Input(shape=(data.shape[1], ))
    encoded = Dense(data.shape[1],
                    activation='relu',
                    activity_regularizer=regularizers.activity_l1l2(
                        10e-5, 10e-5))(input_sh)
    encoded = noise.GaussianNoise(0.2)(encoded)
    for i in range(hidden_layers):
        encoded = Dense(hidden_neurons[i],
                        activation='relu',
                        activity_regularizer=regularizers.activity_l1l2(
                            10e-5, 10e-5))(encoded)
        encoded = noise.GaussianNoise(0.2)(encoded)
    decoded = encoded
    for j in range(hidden_layers - 1, -1, -1):
        # Mirror the encoder layer sizes, chaining each decoder layer
        # on the previous decoder output.
        decoded = Dense(hidden_neurons[j],
                        activation='relu',
                        activity_regularizer=regularizers.activity_l1l2(
                            10e-5, 10e-5))(decoded)
    decoded = Dense(data.shape[1], activation='sigmoid')(decoded)

    autoencoder = Model(input=input_sh, output=decoded)
    autoencoder.compile(optimizer='adadelta', loss='mse')

    checkpointer = ModelCheckpoint(filepath='data/bestmodel' + output_file +
                                   ".hdf5",
                                   verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=15, verbose=1)

    train_generator = DataGenerator(batch_size)
    train_generator.fit(train_data, train_data)
    val_generator = DataGenerator(batch_size)
    val_generator.fit(val_data, val_data)

    autoencoder.fit_generator(train_generator,
                              samples_per_epoch=len(train_data),
                              nb_epoch=100,
                              validation_data=val_generator,
                              nb_val_samples=len(val_data),
                              max_q_size=batch_size,
                              callbacks=[checkpointer, earlystopper])

    enco = Model(input=input_sh, output=encoded)
    enco.compile(optimizer='adadelta', loss='mse')
    reprsn = enco.predict(data)

    return reprsn
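A usage sketch for the autoencoder function above, assuming data is a dense NumPy matrix of shape (n_samples, n_features); the sizes here are placeholders.

import numpy as np

# Hypothetical call: 1000 samples, 200 features, two hidden layers.
data = np.random.rand(1000, 200).astype('float32')
embedding = model(data, hidden_layers=2, hidden_neurons=[128, 64],
                  output_file='toy')
print(embedding.shape)  # (1000, 64): output of the innermost encoder layer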
Example #3
def model(params, test_df, batch_size=b_size, nb_epoch=n_epoch, is_train=True):
    # set parameters:
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    test, test_df = load_data(test_df)
    test_gos = test_df['gos'].values
    test_data, test_labels = test
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Test data size: %d" % len(test_data[0]))
    model_path = (DATA_ROOT + 'models/model_' + FUNCTION + '.h5')
    checkpointer = ModelCheckpoint(filepath=model_path,
                                   verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    logging.info('Starting training the model')
    batch_size_new = 2
    test_generator = DataGenerator(batch_size_new, nb_classes)
    test_generator.fit(test_data, test_labels)
    logging.info('Loading best model')
    pred = {}
    start_time = time.time()
    with graph.as_default():
        model = load_model(model_path)
        logging.info('Loading time: %d' % (time.time() - start_time))
        start_time = time.time()
        preds = model.predict_generator(test_generator,
                                        val_samples=len(test_data[0]))
    running_time = time.time() - start_time
    logging.info('Running time: %d %d' % (running_time, len(test_data[0])))
    logging.info('Computing performance')
    f, p, r, t, preds_max = compute_performance(preds, test_labels, test_gos)
    roc_auc = compute_roc(preds, test_labels)
    mcc = compute_mcc(preds_max, test_labels)
    logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('MCC: \t %f ' % (mcc, ))
    print('f :%.3f & p: %.3f & r: %.3f & roc_auc: %.3f & mcc: %.3f' %
          (f, p, r, roc_auc, mcc))
    proteins = test_df['proteins']
    predictions = list()
    for i in xrange(preds_max.shape[0]):
        predictions.append(preds_max[i])
    # Count the positive labels for the first sample...
    counter2 = 0
    for ele in test_labels[0]:
        if ele == 1:
            counter2 = counter2 + 1

    # ...and the positive predictions for the same sample.
    counter = 0
    for ele in predictions[0]:
        if ele == 1:
            counter = counter + 1

    prediction_list = find_the_predicted_go_term(predictions, functions)
    return prediction_list
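find_the_predicted_go_term is not defined in the snippet. A plausible sketch, assuming each prediction is a binary vector aligned with the functions list of GO terms:

def find_the_predicted_go_term(predictions, functions):
    # Map each binary prediction vector to the GO terms whose
    # positions are set to 1.
    prediction_list = []
    for pred in predictions:
        terms = [functions[j] for j in range(len(functions)) if pred[j] == 1]
        prediction_list.append(terms)
    return prediction_list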
Example #4
def model(model_name):
    # set parameters:
    batch_size = 128
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    data, targets = load_data()
    data_generator = DataGenerator(batch_size, nb_classes)
    data_generator.fit(data, None)

    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Data size: %d" % len(data[0]))
    logging.info('Loading the model')
    with open(DATA_ROOT + model_name + '_' + FUNCTION + '.json', 'r') as f:
        json_string = next(f)
    model = model_from_json(json_string)

    optimizer = RMSprop()
    model.compile(optimizer=optimizer, loss='binary_crossentropy')

    model_path = DATA_ROOT + model_name + '_weights_' + FUNCTION + '.pkl'
    logging.info('Compilation finished in %d sec' % (time.time() - start_time))
    logging.info('Loading weights')
    load_model_weights(model, model_path)

    logging.info('Predicting')
    preds = model.predict_generator(data_generator,
                                    val_samples=len(data[0]),
                                    nb_worker=12)
    for i in xrange(len(preds)):
        preds[i] = preds[i].reshape(-1, 1)
    preds = np.concatenate(preds, axis=1)

    # Enforce the GO hierarchy: an ancestor's score must be at least as
    # high as any descendant's; count and repair violations.
    incon = 0
    for i in xrange(len(data)):
        for j in xrange(len(functions)):
            anchestors = get_anchestors(go, functions[j])
            for p_id in anchestors:
                if (p_id not in [GO_ID, functions[j]]
                        and preds[i, go_indexes[p_id]] < preds[i, j]):
                    incon += 1
                    preds[i, go_indexes[p_id]] = preds[i, j]
    logging.info('Inconsistent predictions: %d' % incon)

    predictions = list()
    for i in xrange(len(targets)):
        predictions.append(preds[i])
    df = pd.DataFrame({'targets': targets, 'predictions': predictions})
    print(len(df))
    df.to_pickle(DATA_ROOT + model_name + '_preds_' + FUNCTION + '.pkl')
    logging.info('Done in %d sec' % (time.time() - start_time))
Example #5
def run_model(model_path='data/model.h5', batch_size=128):
    model = load_model(model_path)
    prots, data = load_data()
    data_generator = DataGenerator(batch_size)
    data_generator.fit(data)
    # Extract the feature sub-model (assumes layers[1] is a nested Model).
    model = model.layers[1]
    steps = math.ceil(data.shape[0] / batch_size)
    output = model.predict_generator(data_generator, steps=steps, verbose=1)
    print(output)
    vectors = list()
    for i in range(output.shape[0]):
        vectors.append(output[i, :])
    df = pd.DataFrame({'proteins': prots, 'vectors': vectors})
    df.to_pickle('data/vectors.pkl')
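Indexing model.layers[1] only yields a usable predictor when the feature extractor was built as a nested Model. An alternative sketch that also works for flat models, where 'features' is a hypothetical layer name:

from keras.models import Model, load_model

full_model = load_model('data/model.h5')
# Build the extractor from the full model's graph instead of assuming
# layers[1] is itself a Model.
feature_model = Model(inputs=full_model.input,
                      outputs=full_model.get_layer('features').output)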
Example #6
def model(model_name):
    # set parameters:
    batch_size = 128
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    data, targets = load_data()
    data_generator = DataGenerator(batch_size, nb_classes)
    data_generator.fit(data, None)

    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Data size: %d" % len(data[0]))
    logging.info('Loading the model')
    model = load_model(DATA_ROOT + model_name + '_' + FUNCTION + '.h5')

    logging.info('Predicting')
    preds = model.predict_generator(data_generator, val_samples=len(data[0]))

    # incon = 0
    # for i in xrange(len(data)):
    #    for j in xrange(len(functions)):
    #         anchestors = get_anchestors(go, functions[j])
    #         for p_id in anchestors:
    #             if (p_id not in [GO_ID, functions[j]] and
    #                     preds[i, go_indexes[p_id]] < preds[i, j]):
    #                 incon += 1
    #                 preds[i, go_indexes[p_id]] = preds[i, j]
    # logging.info('Inconsistent predictions: %d' % incon)

    predictions = list()
    for i in xrange(len(targets)):
        predictions.append(preds[i])
    df = pd.DataFrame({'targets': targets, 'predictions': predictions})
    print(len(df))
    df.to_pickle(DATA_ROOT + model_name + '_preds_' + FUNCTION + '.pkl')
    logging.info('Done in %d sec' % (time.time() - start_time))
Example #7
def train(data, batch_size, epochs, model_file, validation_split=0.8):
    # Note: the index is never shuffled, so this is a sequential split.
    index = np.arange(data.shape[0])
    train_n = int(data.shape[0] * validation_split)
    train_data, valid_data = data[index[:train_n], :], data[index[train_n:], :]
    train_generator = DataGenerator(batch_size)
    train_generator.fit(train_data, train_data)
    valid_generator = DataGenerator(batch_size)
    valid_generator.fit(valid_data, valid_data)
    steps = int(math.ceil(train_n / batch_size))
    valid_n = data.shape[0] - train_n
    valid_steps = int(math.ceil(valid_n / batch_size))

    checkpointer = ModelCheckpoint(filepath=model_file,
                                   verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=100, verbose=1)

    model = build_model()
    model.fit_generator(train_generator,
                        steps_per_epoch=steps,
                        epochs=epochs,
                        validation_data=valid_generator,
                        validation_steps=valid_steps,
                        callbacks=[earlystopper, checkpointer])
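build_model() is not shown. Since both generators are fitted with the data as input and target, it presumably returns an autoencoder; a minimal sketch under that assumption, with placeholder dimensions:

from keras.layers import Dense, Input
from keras.models import Model


def build_model(input_dim=8000, latent_dim=256):
    # Hypothetical dense autoencoder; the sizes are placeholders.
    inputs = Input(shape=(input_dim,))
    encoded = Dense(latent_dim, activation='relu')(inputs)
    decoded = Dense(input_dim, activation='sigmoid')(encoded)
    autoencoder = Model(inputs, decoded)
    autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
    return autoencoder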
Example #8
def model(batch_size=128, nb_epoch=100, is_train=True):
    # set parameters:
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    train, val, test, train_df, valid_df, test_df = load_data()
    train_df = pd.concat([train_df, valid_df])
    test_gos = test_df['gos'].values
    train_data, train_labels = train
    val_data, val_labels = val
    test_data, test_labels = test
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data))
    logging.info("Validation data size: %d" % len(val_data))
    logging.info("Test data size: %d" % len(test_data))

    model_path = DATA_ROOT + 'models/model_seq_' + FUNCTION + '.h5'
    checkpointer = ModelCheckpoint(filepath=model_path,
                                   verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

    logging.info('Starting training the model')

    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)

    if is_train:
        model = get_model()
        model.fit_generator(train_generator,
                            samples_per_epoch=len(train_data),
                            nb_epoch=nb_epoch,
                            validation_data=valid_generator,
                            nb_val_samples=len(val_data),
                            max_q_size=batch_size,
                            callbacks=[checkpointer, earlystopper])

    logging.info('Loading best model')
    model = load_model(model_path)

    # Debug block: inspect the feature sub-model's output shape.
    # Commented out so the evaluation below is reachable.
    # feature_model = model.layers[1]
    # output = feature_model.predict_generator(test_generator,
    #                                          val_samples=len(test_data))
    # print(output.shape)
    logging.info('Predicting')
    preds = model.predict_generator(test_generator, val_samples=len(test_data))
    # incon = 0
    # for i in range(len(test_data)):
    #     for j in range(len(functions)):
    #         childs = set(go[functions[j]]['children']).intersection(func_set)
    #         ok = True
    #         for n_id in childs:
    #             if preds[i, j] < preds[i, go_indexes[n_id]]:
    #                 preds[i, j] = preds[i, go_indexes[n_id]]
    #                 ok = False
    #         if not ok:
    #             incon += 1
    logging.info('Computing performance')
    f, p, r, t, preds_max = compute_performance(preds, test_labels, test_gos)
    roc_auc = compute_roc(preds, test_labels)
    mcc = compute_mcc(preds_max, test_labels)
    logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('MCC: \t %f ' % (mcc, ))
    print('%.3f & %.3f & %.3f & %.3f & %.3f' % (f, p, r, roc_auc, mcc))
    # logging.info('Inconsistent predictions: %d' % incon)
    # logging.info('Saving the predictions')
    # proteins = test_df['proteins']
    # predictions = list()
    # for i in range(preds_max.shape[0]):
    #     predictions.append(preds_max[i])
    # df = pd.DataFrame(
    #     {
    #         'proteins': proteins, 'predictions': predictions,
    #         'gos': test_df['gos'], 'labels': test_df['labels']})
    # df.to_pickle(DATA_ROOT + 'test-' + FUNCTION + '-predictions.pkl')
    # logging.info('Done in %d sec' % (time.time() - start_time))

    function_centric_performance(functions, preds.T, test_labels.T)
Example #9
def model(batch_size=128, nb_epoch=100):
    # set parameters:
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    train, val, test, train_df, valid_df, test_df = load_data()
    train_df = pd.concat([train_df, valid_df])
    test_gos = test_df['gos'].values
    train_data, train_labels = train
    val_data, val_labels = val
    test_data, test_labels = test
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data[0]))
    logging.info("Validation data size: %d" % len(val_data[0]))
    logging.info("Test data size: %d" % len(test_data[0]))

    # pre_model_path = DATA_ROOT + 'pre_model_weights_' + FUNCTION + '.pkl'
    model_path = DATA_ROOT + 'model_' + FUNCTION + '.h5'
    checkpointer = ModelCheckpoint(filepath=model_path,
                                   verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

    logging.info('Starting training the model')

    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)

    # model = get_model()
    # model.fit_generator(
    #     train_generator,
    #     samples_per_epoch=len(train_data[0]),
    #     nb_epoch=nb_epoch,
    #     validation_data=valid_generator,
    #     nb_val_samples=len(val_data[0]),
    #     max_q_size=batch_size,
    #     callbacks=[checkpointer, earlystopper])

    logging.info('Loading best model')
    model = load_model(model_path)

    logging.info('Predicting')
    preds = model.predict_generator(test_generator,
                                    val_samples=len(test_data[0]))
    # incon = 0
    # for i in range(len(test_data)):
    #     for j in range(len(functions)):
    #         childs = set(go[functions[j]]['children']).intersection(func_set)
    #         ok = True
    #         for n_id in childs:
    #             if preds[i, j] < preds[i, go_indexes[n_id]]:
    #                 preds[i, j] = preds[i, go_indexes[n_id]]
    #                 ok = False
    #         if not ok:
    #             incon += 1
    logging.info('Computing performance')
    f, p, r, t, preds_max = compute_performance(preds, test_labels, test_gos)
    roc_auc = compute_roc(preds, test_labels)
    logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
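The compute_performance helper used throughout these examples is not included. Most call sites expect a (f, p, r, t, preds_max) tuple; below is a sketch of a threshold-sweeping Fmax computation that matches that shape, written as an assumption about the original and requiring dense 0/1 NumPy label arrays.

import numpy as np


def compute_performance(preds, labels, gos=None):
    # Sweep thresholds, keep the one that maximizes F1 over the whole
    # label matrix, and return the thresholded predictions as well.
    f_max = p_max = r_max = t_max = 0.0
    preds_max = None
    for t in np.arange(0.01, 1.0, 0.01):
        pm = (preds >= t).astype(np.int32)
        tp = float(np.sum(pm * labels))
        fp = float(np.sum(pm * (1 - labels)))
        fn = float(np.sum((1 - pm) * labels))
        if tp == 0:
            continue
        p = tp / (tp + fp)
        r = tp / (tp + fn)
        f = 2 * p * r / (p + r)
        if f > f_max:
            f_max, p_max, r_max, t_max, preds_max = f, p, r, t, pm
    return f_max, p_max, r_max, t_max, preds_max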
Example #10
def model():
    # set parameters:
    batch_size = 128
    nb_epoch = 100
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    train, val, test, train_df, valid_df, test_df = load_data()
    train_df = pd.concat([train_df, valid_df])
    test_gos = test_df['gos'].values
    train_data, train_labels = train
    val_data, val_labels = val
    test_data, test_labels = test
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data[0]))
    logging.info("Validation data size: %d" % len(val_data[0]))
    logging.info("Test data size: %d" % len(test_data[0]))

    pre_model_path = DATA_ROOT + 'pre_model_weights_' + FUNCTION + '.pkl'
    model_path = DATA_ROOT + 'model_weights_' + FUNCTION + '.pkl'
    last_model_path = DATA_ROOT + 'model_weights_' + FUNCTION + '.last.pkl'
    checkpointer = MyCheckpoint(filepath=model_path,
                                verbose=1,
                                save_best_only=True,
                                save_weights_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

    model = get_model()
    # logging.info('Loading pretrained weights')
    # load_model_weights(model, pre_model_path)

    logging.info('Starting training the model')

    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)

    # model.fit_generator(
    #     train_generator,
    #     samples_per_epoch=len(train_data[0]),
    #     nb_epoch=nb_epoch,
    #     validation_data=valid_generator,
    #     nb_val_samples=len(val_data[0]),
    #     max_q_size=batch_size,
    #     callbacks=[checkpointer, earlystopper])

    logging.info('Loading weights')
    load_model_weights(model, model_path)
    model.save(DATA_ROOT + 'model_%s.h5' % FUNCTION)
    logging.info('Predicting')
    preds = model.predict_generator(test_generator,
                                    val_samples=len(test_data[0]))
    # incon = 0
    # for i in xrange(len(test_data)):
    #     for j in xrange(len(functions)):
    #         childs = set(go[functions[j]]['children']).intersection(func_set)
    #         ok = True
    #         for n_id in childs:
    #             if preds[i, j] < preds[i, go_indexes[n_id]]:
    #                 preds[i, j] = preds[i, go_indexes[n_id]]
    #                 ok = False
    #         if not ok:
    #             incon += 1
    logging.info('Computing performance')
    f, p, r, preds_max = compute_performance(preds, test_labels, test_gos)
    # roc_auc = compute_roc(preds, test_labels)
    # logging.info('Fmax measure: \t %f %f %f' % (f, p, r))
    # logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    # logging.info('Inconsistent predictions: %d' % incon)
    logging.info('Saving the predictions')
    proteins = test_df['proteins']
    predictions = list()
    for i in xrange(preds_max.shape[0]):
        predictions.append(preds_max[i])
    df = pd.DataFrame({
        'proteins': proteins,
        'predictions': predictions,
        'gos': test_df['gos'],
        'labels': test_df['labels']
    })
    df.to_pickle(DATA_ROOT + 'test-' + FUNCTION + '-predictions.pkl')
    logging.info('Done in %d sec' % (time.time() - start_time))
Example #11
def model(params, batch_size=128, nb_epoch=6, is_train=True):
    # set parameters:
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    train, val, test, train_df, valid_df, test_df = load_data()
    train_df = pd.concat([train_df, valid_df])
    test_gos = test_df['gos'].values
    train_data, train_labels = train
    val_data, val_labels = val
    test_data, test_labels = test
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data[0]))
    logging.info("Validation data size: %d" % len(val_data[0]))
    logging.info("Test data size: %d" % len(test_data[0]))

    model_path = DATA_ROOT + 'models/model_' + FUNCTION + '.h5'
    # Alternative path that encodes the hyperparameters:
    # model_path = (DATA_ROOT + 'models/model_' + FUNCTION +
    #               '-' + str(params['embedding_dims']) +
    #               '-' + str(params['nb_filter']) +
    #               '-' + str(params['nb_conv']) +
    #               '-' + str(params['nb_dense']) + '.h5')
    checkpointer = ModelCheckpoint(
        filepath=model_path,
        verbose=1, save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

    logging.info('Starting training the model')

    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)

    if is_train:
        model = get_model(params)
        model.fit_generator(
            train_generator,
            samples_per_epoch=len(train_data[0]),
            nb_epoch=nb_epoch,
            validation_data=valid_generator,
            nb_val_samples=len(val_data[0]),
            max_q_size=batch_size,
            callbacks=[checkpointer, earlystopper])
    logging.info('Loading best model')
    start_time = time.time()
    model = load_model(model_path)
    logging.info('Loading time: %d' % (time.time() - start_time))
    # orgs = ['9606', '10090', '10116', '7227', '7955',
    #         '559292', '3702', '284812', '6239',
    #         '83333', '83332', '224308', '208964']
    # for org in orgs:
    #     logging.info('Predicting for %s' % (org,))
    #     train, val, test, train_df, valid_df, test_df = load_data(org=org)
    #     test_data, test_labels = test
    #     test_gos = test_df['gos'].values
    #     test_generator = DataGenerator(batch_size, nb_classes)
    #     test_generator.fit(test_data, test_labels)
    start_time = time.time()
    preds = model.predict_generator(
        test_generator, val_samples=len(test_data[0]))
    running_time = time.time() - start_time
    logging.info('Running time: %d %d' % (running_time, len(test_data[0])))
    logging.info('Computing performance')
    f, p, r, t, preds_max = compute_performance(preds, test_labels, test_gos)
    roc_auc = compute_roc(preds, test_labels)
    mcc = compute_mcc(preds_max, test_labels)
    logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('MCC: \t %f ' % (mcc, ))
    print('%.3f & %.3f & %.3f & %.3f & %.3f' % (f, p, r, roc_auc, mcc))
    # return f
    # logging.info('Inconsistent predictions: %d' % incon)
    # logging.info('Saving the predictions')
    proteins = test_df['proteins']
    predictions = list()
    for i in range(preds_max.shape[0]):
        predictions.append(preds_max[i])
    df = pd.DataFrame(
        {
            'proteins': proteins, 'predictions': predictions,
            'gos': test_df['gos'], 'labels': test_df['labels']})
    df.to_pickle(DATA_ROOT + 'test-' + FUNCTION + '-preds.pkl')
Example #12
def model(params, batch_size=b_size, nb_epoch=n_epoch, is_train=True):
    # set parameters:
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    train, val, test, train_df, valid_df, test_df = load_data()
    print len(test_df)
    train_df = pd.concat([train_df, valid_df])
    test_gos = test_df['gos'].values
    train_data, train_labels = train
    val_data, val_labels = val
    test_data, test_labels = test
    print len(test_labels)
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data[0]))
    logging.info("Validation data size: %d" % len(val_data[0]))
    logging.info("Test data size: %d" % len(test_data[0]))

    model_path = (DATA_ROOT + 'models/model_' + FUNCTION + '.h5')
    checkpointer = ModelCheckpoint(filepath=model_path,
                                   verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

    logging.info('Starting training the model')
    print train_data
    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)
    if is_train:
        model = get_model(params)
        model.fit_generator(train_generator,
                            samples_per_epoch=len(train_data[0]),
                            nb_epoch=nb_epoch,
                            validation_data=valid_generator,
                            nb_val_samples=len(val_data[0]),
                            max_q_size=batch_size,
                            callbacks=[checkpointer, earlystopper])
    logging.info('Loading best model')
    start_time = time.time()
    model = load_model(model_path)
    logging.info('Loading time: %d' % (time.time() - start_time))
    start_time = time.time()
    preds = model.predict_generator(test_generator,
                                    val_samples=len(test_data[0]))
    running_time = time.time() - start_time
    logging.info('Running time: %d %d' % (running_time, len(test_data[0])))
    logging.info('Computing performance')
    # pred_file="pred"+FUNCTION+".txt"
    # test_file ="test"+FUNCTION+".txt"
    # gos_file = "test"+FUNCTION+"_goc.txt"
    # write_file(pred_file,preds)
    # write_file(test_file,test_labels)
    # write_file(gos_file,test_gos)
    f, p, r, t, preds_max = compute_performance(preds, test_labels, test_gos)
    roc_auc = compute_roc(preds, test_labels)
    mcc = compute_mcc(preds_max, test_labels)
    logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('MCC: \t %f ' % (mcc, ))
    print('f :%.3f & p: %.3f & r: %.3f & roc_auc: %.3f & mcc: %.3f' %
          (f, p, r, roc_auc, mcc))
    write_results([f, p, r, roc_auc, mcc])
    proteins = test_df['proteins']
    predictions = list()
    for i in xrange(preds_max.shape[0]):
        predictions.append(preds_max[i])
    df = pd.DataFrame({
        'proteins': proteins,
        'predictions': predictions,
        'gos': test_df['gos'],
        'labels': test_df['labels']
    })
    print df
    df.to_pickle('test' + FUNCTION + 'preds.pkl')
Example #13
def train_model(batch_size=128,
                epochs=100,
                is_train=True,
                model_path='data/model.h5'):
    # set parameters:
    start_time = time.time()
    logging.info("Loading Data")
    train, valid, test = load_data()
    train_data, train_labels = train
    valid_data, valid_labels = valid
    test_data, test_labels = test

    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % train_data.shape[0])
    logging.info("Validation data size: %d" % valid_data.shape[0])
    logging.info("Test data size: %d" % test_data.shape[0])

    checkpointer = ModelCheckpoint(filepath=model_path,
                                   verbose=1,
                                   save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

    logging.info('Starting training the model')

    train_generator = DataGenerator(batch_size)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size)
    valid_generator.fit(valid_data, valid_labels)
    test_generator = DataGenerator(batch_size)
    test_generator.fit(test_data, test_labels)

    if is_train:
        valid_steps = int(math.ceil(valid_data.shape[0] / batch_size))
        train_steps = int(math.ceil(train_data.shape[0] / batch_size))
        model = get_model()
        model.fit_generator(train_generator,
                            steps_per_epoch=train_steps,
                            epochs=epochs,
                            validation_data=valid_generator,
                            validation_steps=valid_steps,
                            max_queue_size=batch_size,
                            workers=12,
                            callbacks=[checkpointer, earlystopper])

    logging.info('Loading best model')
    model = load_model(model_path)

    logging.info('Predicting')
    test_steps = int(math.ceil(test_data.shape[0] / batch_size))
    preds = model.predict_generator(test_generator,
                                    steps=test_steps,
                                    verbose=1)

    logging.info('Computing performance')
    test_labels = test_labels.toarray()
    f, p, r, t, preds_max = compute_performance(preds, test_labels)
    roc_auc = compute_roc(preds, test_labels)
    mcc = compute_mcc(preds_max, test_labels)
    logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('MCC: \t %f ' % (mcc, ))
    print('%.3f & %.3f & %.3f & %.3f & %.3f' % (f, p, r, roc_auc, mcc))
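compute_roc and compute_mcc are also external helpers. Hedged sketches built on scikit-learn, flattening the dense multi-label matrices into one long binary vector (the original helpers may aggregate differently):

from sklearn.metrics import auc, matthews_corrcoef, roc_curve


def compute_roc(preds, labels):
    # Micro-averaged ROC AUC over all (sample, label) pairs.
    fpr, tpr, _ = roc_curve(labels.flatten(), preds.flatten())
    return auc(fpr, tpr)


def compute_mcc(preds_max, labels):
    # Matthews correlation on the thresholded predictions.
    return matthews_corrcoef(labels.flatten(), preds_max.flatten())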
Example #14
def model():
    # set parameters:
    batch_size = 128
    nb_epoch = 100
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    train, val, test, test_df = load_data()
    test_gos = test_df['gos'].values
    train_data, train_labels = train
    val_data, val_labels = val
    test_data, test_labels = test
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data))
    logging.info("Validation data size: %d" % len(val_data))
    logging.info("Test data size: %d" % len(test_data))
    logging.info("Building the model")
    inputs = Input(shape=(MAXLEN, ), dtype='int32', name='input1')
    feature_model = get_feature_model()(inputs)
    layers = get_layers(feature_model)
    output_models = []
    for i in range(len(functions)):
        output_models.append(layers[functions[i]]['output'])
    net = merge(output_models, mode='concat', concat_axis=1)
    # net = Dense(nb_classes * 2, activation='relu')(feature_model)
    # net = Dense(nb_classes, activation='sigmoid')(net)
    # net = Activation('sigmoid')(net)
    model = Model(input=inputs, output=net)
    logging.info('Model built in %d sec' % (time.time() - start_time))
    logging.info('Saving the model')
    model_json = model.to_json()
    with open(DATA_ROOT + 'model_seq_' + FUNCTION + '.json', 'w') as f:
        f.write(model_json)
    logging.info('Compiling the model')
    optimizer = RMSprop()

    model.compile(optimizer=optimizer, loss='binary_crossentropy')

    pre_model_path = DATA_ROOT + 'pre_model_seq_weights_' + FUNCTION + '.pkl'
    model_path = DATA_ROOT + 'model_seq_weights_' + FUNCTION + '.pkl'
    checkpointer = MyCheckpoint(filepath=model_path,
                                verbose=1,
                                save_best_only=True,
                                save_weights_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    logging.info('Compilation finished in %d sec' % (time.time() - start_time))

    # logging.info('Loading pretrained weights')
    # load_model_weights(model, pre_model_path)

    logging.info('Starting training the model')

    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)
    # model.fit_generator(
    #     train_generator,
    #     samples_per_epoch=len(train_data),
    #     nb_epoch=nb_epoch,
    #     validation_data=valid_generator,
    #     nb_val_samples=len(val_data),
    #     max_q_size=batch_size,
    #     callbacks=[checkpointer, earlystopper])

    logging.info('Loading weights')
    load_model_weights(model, model_path)

    # model.save(DATA_ROOT + 'model_%s.h5' % FUNCTION)

    preds = model.predict_generator(test_generator, val_samples=len(test_data))

    logging.info(preds.shape)
    incon = 0
    # for i in xrange(len(test_data)):
    #     for j in xrange(len(functions)):
    #         childs = set(go[functions[j]]['children']).intersection(func_set)
    #         ok = True
    #         for n_id in childs:
    #             if preds[i, j] < preds[i, go_indexes[n_id]]:
    #                 preds[i, j] = preds[i, go_indexes[n_id]]
    #                 ok = False
    #         if not ok:
    #             incon += 1
    f, p, r, preds_max = compute_performance(preds, test_labels, test_gos)
    roc_auc = compute_roc(preds, test_labels)
    logging.info('Fmax measure: \t %f %f %f' % (f, p, r))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('Inconsistent predictions: %d' % incon)
    logging.info('Saving the predictions')
    proteins = test_df['proteins']
    predictions = list()
    for i in xrange(preds_max.shape[0]):
        predictions.append(preds_max[i])
    df = pd.DataFrame({
        'proteins': proteins,
        'predictions': predictions,
        'gos': test_df['gos'],
        'labels': test_df['labels']
    })
    df.to_pickle(DATA_ROOT + 'test-' + FUNCTION + '-preds-seq.pkl')
    logging.info('Done in %d sec' % (time.time() - start_time))

    function_centric_performance(functions, preds.T, test_labels.T)
Example #15
def model(data, output_file, validation_split=0.9):

    # Load the hidden-neuron sizes and the number of hidden layers
    # from plain-text config files.
    neurons_file = open('neorons.txt', 'r')
    line = neurons_file.readline()
    print line
    layers_file = open('layer.txt', 'r')
    l = layers_file.readline()

    hidden_neurons = arg_as_list(line)
    hidden_layers = int(l)
    train_n = int(validation_split * len(data))
    batch_size = 50
    e = 50
    m = data.shape[1]

    if hidden_neurons[0] == 0:
        train_data = data
        val_data = data
        input_sh = Input(shape=(data.shape[1], ))
        encoded = noise.GaussianNoise(0.2)(input_sh)

    elif hidden_neurons[0] == 1:
        train_data = data[:train_n, :]
        val_data = data[train_n:, :]
        # train_data = np.expand_dims(train_data, axis=0)
        # val_data = np.expand_dims(val_data, axis=0)
        # train_data = tf.reshape(train_data, (train_n, data.shape[1], 1))
        input_sh = Input(shape=(data.shape[1], ))
        input_sh1 = Reshape((data.shape[1], 1))(input_sh)
        encoded = noise.GaussianNoise(0.2)(input_sh1)

    else:
        train_data = data[:train_n, :]
        val_data = data[train_n:, :]
        input_sh = Input(shape=(data.shape[1], ))
        encoded = noise.GaussianNoise(0.2)(input_sh)

    print 'encoded ndim: ', encoded.ndim
    print hidden_neurons
    print 'hidden layers: ', hidden_layers
    if hidden_neurons[0] == 0:
        batch_size = len(data)
        encoded = GraphConvolution(
            input_dim=data.shape[1],
            output_dim=hidden_neurons[1],
            support=input_sh,
            act=tf.nn.relu,
        )(encoded)
    elif hidden_neurons[0] == 1:
        encoded = Conv1D(filters=1,
                         kernel_size=178 - hidden_neurons[1],
                         activation='relu')(encoded)
        encoded = Flatten()(encoded)
        print 'encoded ndim after flatten: ', encoded.ndim
    elif hidden_neurons[0] == 2:
        encoded = Dense(hidden_neurons[1], activation='relu')(encoded)
    encoded = noise.GaussianNoise(0.2)(encoded)

    for i in range(2, hidden_layers):
        print i, hidden_neurons[i]
        print 'dense input ndim: ', encoded.ndim
        encoded = Dense(hidden_neurons[i], activation='relu')(encoded)
        encoded = noise.GaussianNoise(0.2)(encoded)

    decoded = Dense(hidden_neurons[-2], activation='relu')(encoded)

    print hidden_neurons[-2]
    for j in range(hidden_layers - 3, 0, -1):
        print 'decoder layer ', j, hidden_neurons[j]
        decoded = Dense(hidden_neurons[j], activation='relu')(decoded)
        print data.shape[1]
    decoded = Dense(data.shape[1], activation='sigmoid')(decoded)
    autoencoder = Model(inputs=input_sh, outputs=decoded)  #bp to train weights
    autoencoder.compile(optimizer='adadelta', loss='mse')

    #checkpointer = ModelCheckpoint(filepath='bestmodel' + output_file + ".hdf5", verbose=1, save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=15, verbose=1)
    train_generator = DataGenerator(batch_size)
    train_generator.fit(train_data, train_data)
    val_generator = DataGenerator(batch_size)
    val_generator.fit(val_data, val_data)

    h = autoencoder.fit_generator(
        train_generator,
        steps_per_epoch=len(data) / batch_size,
        epochs=e,
        validation_data=val_generator,
        validation_steps=len(data),
        callbacks=[earlystopper],
    )
    # Use the final recorded validation loss (early stopping may halt
    # training before all e epochs run).
    avge = h.history['val_loss'][-1]
    print 'avge    ', avge
    # Keep only the encoder part to produce the embedding.
    enco = Model(inputs=input_sh, outputs=encoded)
    enco.compile(optimizer='adadelta', loss='mse')
    reprsn = enco.predict(data, batch_size=batch_size)
    return reprsn, avge