Example #1
def human_go_annotations():
    go = get_gene_ontology()
    annots = {}
    df = pd.read_pickle('data/cafa3/swissprot_exp.pkl')
    for i, row in df.iterrows():
        acc = row['accessions']
        gos = set()
        for go_id in row['annots']:
            go_id = go_id.split('|')
            if go_id[1] in EXP_CODES and go_id[0] in go:
                gos.add(go_id[0])
        if len(gos) > 0:
            annots[acc] = gos
    id_df = pd.read_pickle('data/idmapping.9606.pkl')
    st_ids = dict()
    for i, row in id_df.iterrows():
        if isinstance(row['string'], str):
            st_ids[row['accessions']] = row['string']
    with open('data/human_annotations.tab', 'w') as f:
        for acc, gos in annots.items():
            if acc in st_ids:
                f.write(st_ids[acc])
                for go_id in gos:
                    f.write('\t' + go_id)
                f.write('\n')
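All of these examples revolve around get_gene_ontology from the deepgo project's utils module. Its implementation is not shown on this page, but from the way its result is used (membership tests like go_id in go, lookups such as go[go_id]['name'] and go[go_id]['children']) it returns a dict keyed by GO id. The snippet below is only a minimal sketch of a go.obo parser that produces that shape; it is an assumption for illustration, not the project's code, and it ignores details such as obsolete terms and alt_id aliases.

def parse_go_obo(filename='go.obo'):
    # Parse [Term] stanzas of an OBO file into {go_id: {'name', 'is_a', 'children'}}.
    go = {}
    obj = None
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line == '[Term]':
                if obj is not None and 'id' in obj:
                    go[obj['id']] = obj
                obj = {'is_a': [], 'children': set()}
            elif obj is not None and ': ' in line:
                key, value = line.split(': ', 1)
                if key == 'id':
                    obj['id'] = value
                elif key == 'name':
                    obj['name'] = value
                elif key == 'is_a':
                    # 'is_a: GO:0008150 ! biological_process' -> keep only the id
                    obj['is_a'].append(value.split(' ! ')[0])
    if obj is not None and 'id' in obj:
        go[obj['id']] = obj
    # Reverse links so that go[go_id]['children'] works as in the examples above.
    for go_id, term in go.items():
        for parent_id in term['is_a']:
            if parent_id in go:
                go[parent_id]['children'].add(go_id)
    return go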
Example #2
File: plots.py, Project: oeway/deepgo
def table():
    bp = get_data('bp.res')
    mf = get_data('mf.res')
    cc = get_data('cc.res')
    bp_seq = get_data('bp-seq.res')
    mf_seq = get_data('mf-seq.res')
    cc_seq = get_data('cc-seq.res')
    go = get_gene_ontology('go.obo')
    gos = go[BIOLOGICAL_PROCESS]['children']
    res = list()
    for go_id in gos:
        if go_id in bp:
            res.append((go_id, go[go_id]['name'], bp[go_id][0], bp[go_id][1],
                        bp_seq[go_id][0], bp_seq[go_id][1]))
    for row in sorted(res, key=lambda x: x[2], reverse=True):
        print('%s & %s & %f & %f & %f & %f \\\\' % row)
    gos = go[MOLECULAR_FUNCTION]['children']
    print()
    res = list()
    for go_id in gos:
        if go_id in mf:
            res.append((go_id, go[go_id]['name'], mf[go_id][0], mf[go_id][1],
                        mf_seq[go_id][0], mf_seq[go_id][1]))
    for row in sorted(res, key=lambda x: x[2], reverse=True):
        print('%s & %s & %f & %f & %f & %f \\\\' % row)
    gos = go[CELLULAR_COMPONENT]['children']
    print()
    res = list()
    for go_id in gos:
        if go_id in cc:
            res.append((go_id, go[go_id]['name'], cc[go_id][0], cc[go_id][1],
                        cc_seq[go_id][0], cc_seq[go_id][1]))
    for row in sorted(res, key=lambda x: x[2], reverse=True):
        print('%s & %s & %f & %f & %f & %f \\\\' % row)
Example #3
def main(function, device, org, train):
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    global ORG
    ORG = org
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    if ORG is not None:
        logging.info('Organism %s' % ORG)
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    global node_names
    node_names = set()
    with tf.device('/' + device):
        model(is_train=train)
Example #4
def main(split):
    global SPLIT
    SPLIT = split
    global GO_IDS
    GO_IDS = list(FUNC_DICT.values())
    global go
    go = get_gene_ontology('go.obo')
    func_df = pd.read_pickle(DATA_ROOT + 'bp.pkl')
    global functions
    functions = func_df['functions'].values
    func_df = pd.read_pickle(DATA_ROOT + 'mf.pkl')
    functions = np.concatenate((functions, func_df['functions'].values))
    func_df = pd.read_pickle(DATA_ROOT + 'cc.pkl')
    functions = np.concatenate((functions, func_df['functions'].values))
    global func_set
    func_set = (
        get_go_set(go, GO_IDS[0])
        | get_go_set(go, GO_IDS[1])
        | get_go_set(go, GO_IDS[2]))
    print(len(functions))
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    run()
def get_annotations():
    gene_ontology = get_gene_ontology()
    annots = dict()
    gene_name = dict()
    with open(DATA_ROOT + 'gene_association.sgd', 'r') as f:
        for line in f:
            if line[0] == '!':
                continue
            items = line.strip().split('\t')
            if items[3] == 'NOT' or items[6] == 'ND':
                continue
            gene_id = items[1]
            gene_name[gene_id] = items[2]
            go = items[4]
            if gene_id not in annots:
                annots[gene_id] = set()
            if go in gene_ontology:
                annots[gene_id].add(go)
    groups = dict()
    for gene_id, gos in annots.items():
        l = len(gos)
        if l not in groups:
            groups[l] = list()
        groups[l].append((list(gos), gene_id))
    with open(DATA_ROOT + 'sgd_annotations_genes2.txt', 'w') as f:
        for group in sorted(groups.keys()):
            gos_list = groups[group]
            print(group)
            for gos, gene_id in gos_list:
                f.write(gene_name[gene_id] + '\t')
                f.write(gos[0])
                for go in gos[1:]:
                    f.write('\t' + go)
                f.write('\n')
Example #6
def main(function, device, org, train, param, embeddingmethod, shuffleseed,
         buildmethod, evomodel, cached):
    global CACHED
    CACHED = cached
    global BUILDMETHOD
    BUILDMETHOD = buildmethod
    global EVOMODEL
    EVOMODEL = evomodel
    global EMBEDDINGMETHOD
    EMBEDDINGMETHOD = embeddingmethod
    global SEED
    SEED = shuffleseed
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    global ORG
    ORG = org
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    global experiment_id
    experiment_id = str(function) + '-' + str(embeddingmethod) + '-' + str(
        shuffleseed) + '-' + str(buildmethod) + '-' + str(evomodel)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    a = experiment_id
    global resdir
    resdir = "results/" + experiment_id
    if not os.path.isdir(resdir):
        os.mkdir(resdir)
    if ORG is not None:
        logging.info('Organism %s' % ORG)
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    global node_names
    node_names = set()
    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 3,
            'nb_dense': 2,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32
        }
        # model(params, is_train=train)
        model(params, is_train=True)
Example #7
File: blast.py, Project: yuanenming/deepgo
def compute_performance(func):
    go = get_gene_ontology()
    train_df = pd.read_pickle('data/swissexp/train-' + func + '.pkl')
    test_df = pd.read_pickle('data/swissexp/test-' + func + '.pkl')

    train_labels = {}
    test_labels = {}
    for i, row in train_df.iterrows():
        go_set = set()
        for go_id in row['gos']:
            if go_id in go:
                go_set |= get_anchestors(go, go_id)
        train_labels[row['proteins']] = row['labels']

    for i, row in test_df.iterrows():
        go_set = set()
        for go_id in row['gos']:
            if go_id in go:
                go_set |= get_anchestors(go, go_id)
        test_labels[row['proteins']] = row['labels']

    preds = list()
    test = list()
    with open('data/swissexp/blast-' + func + '.res') as f:
        for line in f:
            it = line.strip().split('\t')
            preds.append(train_labels[it[1]])
            test.append(test_labels[it[0]])

    total = 0
    p = 0.0
    r = 0.0
    f = 0.0
    p_total = 0
    for label, pred in zip(test, preds):
        tp = np.sum(label * pred)
        fp = np.sum(pred) - tp
        fn = np.sum(label) - tp
        # tp = len(label.intersection(pred))
        # fp = len(pred) - tp
        # fn = len(label) - tp

        if tp == 0 and fp == 0 and fn == 0:
            continue
        total += 1
        if tp != 0:
            p_total += 1
            precision = tp / (1.0 * (tp + fp))
            recall = tp / (1.0 * (tp + fn))
            p += precision
            r += recall
    p /= p_total
    r /= total
    f = 2 * p * r / (p + r)
    return f, p, r
Example #8
def main(function, device, org, train, param):
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    global ORG
    ORG = org
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    if ORG is not None:
        logging.info('Organism %s' % ORG)
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    global node_names
    node_names = set()
    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 3,
            'nb_dense': 2,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32
        }
        model(params, is_train=train)
        dims = [64, 128, 256, 512]
        nb_filters = [16, 32, 64, 128]
        nb_convs = [1, 2, 3, 4]
        nb_dense = [1, 2, 3, 4]
        for i in range(param * 32, param * 32 + 32):
            dim = i % 4
            i //= 4
            nb_fil = i % 4
            i //= 4
            conv = i % 4
            i //= 4
            den = i
            params['embedding_dims'] = dims[dim]
            params['nb_filter'] = nb_filters[nb_fil]
            params['nb_conv'] = nb_convs[conv]
            params['nb_dense'] = nb_dense[den]
Example #9
def get_real_annotations():
    go = get_gene_ontology()
    df = pd.read_pickle('data/cafa3/swissprot_exp.pkl')
    annots = {}
    for i, row in df.iterrows():
        go_set = set()
        for go_id in row['annots']:
            go_id = go_id.split('|')
            if go_id[0] in go and go_id[1] in EXP_CODES:
                go_set |= get_anchestors(go, go_id[0])
        annots[row['proteins']] = go_set
    return annots
def main(function, test_df, device):
    org = None
    param = 0
    filename = 'ResultSequenceStructPPI.txt'
    train = False
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    global ORG
    ORG = org
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    global go_indexes
    go_indexes = dict()
    #will be used for my prediction list
    indexes_for_prediction = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
        indexes_for_prediction[ind] = go_id
    global node_names
    global FILENAME
    FILENAME = filename
    global PARAMS
    node_names = set()
    global prediction_list
    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 1,
            'nb_dense': 1,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32
        }
        PARAMS = params
        prediction_list = model(params, test_df, is_train=train)
    return prediction_list
Example #11
def main(data_root, go_filename, go_domain, split):
    global DATA_ROOT
    DATA_ROOT = data_root
    global go
    go = get_gene_ontology()
    global FUNCTION
    FUNCTION = go_domain
    df = pd.read_pickle(DATA_ROOT + go_domain + '.pkl')
    global functions
    functions = list(df['functions'])
    global func_set
    func_set = set(functions)
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    dataset = load_data(split=split)
    train_model(dataset)
Example #12
def specific_predictions():
    root = 'data/cafa3/'
    go = get_gene_ontology()
    fw = open(root + 'test_predictions_specific.tab', 'w')
    with open(root + 'test_predictions.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            go_set = set(items[1:])
            gos = go_set.copy()
            for go_id in gos:
                anchestors = get_anchestors(go, go_id)
                anchestors.remove(go_id)
                go_set -= anchestors
            fw.write(items[0])
            for go_id in go_set:
                fw.write('\t' + go_id)
            fw.write('\n')
    fw.close()
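Several of these examples (e.g. #7, #9, #12, #18) call get_anchestors(go, go_id) (the project's spelling) to propagate a term to all of its parent terms; the returned set evidently includes the term itself, since Example #12 removes it explicitly. A minimal sketch consistent with that usage, assuming parent ids are stored under 'is_a' in the ontology dict, might look like this (again an assumption, not the project's implementation):

from collections import deque

def get_anchestors(go, go_id):
    # Breadth-first walk up the 'is_a' links, collecting the term and all of its ancestors.
    go_set = set()
    q = deque([go_id])
    while q:
        g_id = q.popleft()
        if g_id in go_set:
            continue
        go_set.add(g_id)
        for parent_id in go[g_id]['is_a']:
            if parent_id in go:
                q.append(parent_id)
    return go_set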
Example #13
def main(function, device, org, train, param, filename):
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    global ORG
    ORG = org
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    if ORG is not None:
        logging.info('Organism %s' % ORG)
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    global node_names
    global FILENAME
    FILENAME = filename
    global PARAMS
    node_names = set()
    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 1,
            'nb_dense': 1,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32
        }
        PARAMS = params
        model(params, is_train=train)
Example #14
def main(device, org, train):
    global GO_IDS
    GO_IDS = list(FUNC_DICT.values())
    global go
    go = get_gene_ontology('go.obo')
    global ORG
    ORG = org
    func_df = pd.read_pickle(DATA_ROOT + 'bp.pkl')
    global functions
    functions = func_df['functions'].values
    func_df = pd.read_pickle(DATA_ROOT + 'mf.pkl')
    functions = np.concatenate((functions, func_df['functions'].values))
    func_df = pd.read_pickle(DATA_ROOT + 'cc.pkl')
    functions = np.concatenate((functions, func_df['functions'].values))
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = (
        get_go_set(go, GO_IDS[0])
        | get_go_set(go, GO_IDS[1])
        | get_go_set(go, GO_IDS[2]))
    logging.info('Functions: %d' % (len(functions), ))
    if ORG is not None:
        logging.info('Organism %s' % ORG)
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    global node_names
    node_names = set()
    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32
        }
        model(params, is_train=train)
Example #15
def main(function, split):
    global SPLIT
    SPLIT = split
    global GO_ID
    GO_ID = FUNC_DICT[function]
    global go
    go = get_gene_ontology('go.obo')
    global FUNCTION
    FUNCTION = function
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = get_go_set(go, GO_ID)
    print(len(functions))
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    run()
Example #16
def main(function, device, model_name):
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    logging.info(len(functions))
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    with tf.device('/' + device):
        model(model_name)
Example #17
def main(function, annot_num):
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    global functions
    functions = deque()
    dfs(GO_ID)
    functions.remove(GO_ID)
    functions = list(functions)
    print((len(functions)))
    global func_set
    func_set = set(functions)
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind

    get_functions(annot_num)
Example #18
def compute_performance():
    root = 'data/cafa3/'
    preds = {}
    annots = {}
    go = get_gene_ontology()
    with open(root + 'test_predictions.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            preds[items[0]] = set(items[1:])
    with open(root + 'test_annotations.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            annots[items[0]] = set()
            for go_id in items[1:]:
                if go_id in go:
                    annots[items[0]] |= get_anchestors(go, go_id)

    total = 0
    p = 0.0
    r = 0.0
    f = 0.0
    for prot, pred_annots in preds.items():
        real_annots = annots[prot]
        if len(real_annots) == 0:
            continue
        tp = len(real_annots.intersection(pred_annots))
        fp = len(pred_annots - real_annots)
        fn = len(real_annots - pred_annots)
        if tp == 0 and fp == 0 and fn == 0:
            continue
        total += 1
        if tp != 0:
            precision = tp / (1.0 * (tp + fp))
            recall = tp / (1.0 * (tp + fn))
            p += precision
            r += recall
            f += 2 * precision * recall / (precision + recall)
    print(f / total, p / total, r / total)
Example #19
from keras.layers import (  # assumed: the original opening line of this import is missing from the listing
    Dense, Dropout, Activation, Flatten)
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.optimizers import SGD
from sklearn.metrics import classification_report
from keras.utils import np_utils
from utils import (
    shuffle, train_val_test_split, get_gene_ontology)
import sys
import os
from collections import deque

LAMBDA = 24
DATA_ROOT = 'data/molecular_functions/paac/'

go = get_gene_ontology()
go_model = dict()


def load_data(go_id):
    pass


def get_model(
        go_id,
        max_features=10000,
        embedding_dims=100,
        nb_filters=250,
        hidden_dims=250,
        pool_length=2,
        filter_length=3):
Example #20
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import (Dense, Dropout, Activation, Input, Flatten, merge)
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import (Convolution1D, MaxPooling1D)
from sklearn.metrics import classification_report
from utils import (shuffle, get_gene_ontology)
from keras.callbacks import ModelCheckpoint, EarlyStopping
import sys
from aaindex import (AAINDEX)
from collections import deque
import pdb

DATA_ROOT = 'yeast/'
MAXLEN = 500
go = get_gene_ontology('goslim_yeast.obo')


def get_go_set(go_id):
    go_set = set()
    q = deque()
    q.append(go_id)
    while len(q) > 0:
        g_id = q.popleft()
        go_set.add(g_id)
        for ch_id in go[g_id]['children']:
            q.append(ch_id)
    return go_set


functions = get_go_set('GO:0003674')
Example #21
File: evaluation.py, Project: oeway/deepgo
def main(function):
    global go
    go = get_gene_ontology()
    func_df = pd.read_pickle(DATA_ROOT + function + '.pkl')
    global functions
    functions = func_df['functions'].values
    func_index = dict()
    for i, go_id in enumerate(functions):
        func_index[go_id] = i
    global func_set
    func_set = set(func_index)
    global GO_ID
    GO_ID = FUNC_DICT[function]
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    pred_df = pd.read_pickle(DATA_ROOT + 'model_preds_' + function + '.pkl')
    # FFPred preds
    preds_dict = {}
    # files = os.listdir('data/ffpred/')
    # for fl in files:
    # with open('data/gofdr/predictions.tab') as f:
    #     for line in f:
    #         it = line.strip().split('\t')
    #         target_id = it[0]
    #         if function[1].upper() != it[2]:
    #             continue
    #         if target_id not in preds_dict:
    #             preds_dict[target_id] = list()
    #         preds_dict[target_id].append((it[1], float(it[3])))
    # print(len(preds_dict))
    target_ids = list()
    predictions = list()
    for key, val in preds_dict.items():
        target_ids.append(key)
        predictions.append(val)
    # pred_df = pd.DataFrame({'targets': target_ids, 'predictions': predictions})

    targets = dict()
    with open('data/cafa3/CAFA3_benchmark20170605/groundtruth/leafonly_' +
              function.upper() + 'O_unique.txt') as f:
        for line in f:
            it = line.strip().split('\t')
            target = it[0]
            go_id = it[1]
            if target not in targets:
                targets[target] = list()
            targets[target].append(go_id)
    target_ids = list()
    labels = list()
    go_ids = list()
    for target, gos in targets.items():
        go_set = set()
        for go_id in gos:
            if go_id in all_functions:
                go_set |= get_anchestors(go, go_id)
        label = np.zeros((len(functions), ), dtype=np.int32)
        for go_id in go_set:
            if go_id in func_index:
                label[func_index[go_id]] = 1
        target_ids.append(target)
        go_ids.append(go_set)
        labels.append(label)
    df = pd.DataFrame({'targets': target_ids, 'gos': go_ids, 'labels': labels})
    df = pd.merge(df, pred_df, on='targets', how='inner')
    df.to_pickle(DATA_ROOT + 'model_preds_filtered_' + function + '.pkl')

    def reshape(values):
        values = np.hstack(values).reshape(len(values), len(values[0]))
        return values

    preds = reshape(df['predictions'].values)
    labels = reshape(df['labels'].values)
    # preds = df['predictions'].values
    gos = df['gos'].values
    f, p, r, t, preds_max = compute_performance(preds, labels, gos)
    print(f, p, r)
    # labels = list()
    # scores = list()
    # for i in range(len(preds)):
    #     all_gos = set()
    #     for go_id in gos[i]:
    #         if go_id in all_functions:
    #             all_gos |= get_anchestors(go, go_id)
    #     all_gos.discard(GO_ID)
    #     scores_dict = {}
    #     for val in preds[i]:
    #         go_id, score = val
    #         if go_id in all_functions:
    #             go_set = get_anchestors(go, go_id)
    #             for g_id in go_set:
    #                 if g_id not in scores_dict or scores_dict[g_id] < score:
    #                     scores_dict[g_id] = score
    #     all_preds = set(scores_dict) # | all_gos
    #     all_preds.discard(GO_ID)
    #     for go_id in all_preds:
    #         if go_id in scores_dict:
    #             scores.append(scores_dict[go_id])
    #         else:
    #             scores.append(0)
    #         if go_id in all_gos:
    #             labels.append(1)
    #         else:
    #             labels.append(0)

    # scores = np.array(scores)
    # labels = np.array(labels)
    roc_auc = compute_roc(preds, labels)
    print(roc_auc)
    # preds_max = (scores > t).astype(np.int32)
    mcc = compute_mcc(preds_max, labels)
    print(mcc)
Example #22
from keras.layers.convolutional import (
    Convolution1D, MaxPooling1D)
from sklearn.metrics import classification_report
from utils import (
    shuffle,
    get_gene_ontology)
from keras.callbacks import ModelCheckpoint, EarlyStopping
import sys
from aaindex import (
    AAINDEX)
from collections import deque
import pdb

DATA_ROOT = 'yeast/'
MAXLEN = 500
go = get_gene_ontology('goslim_yeast.obo')


def get_go_set(go_id):
    go_set = set()
    q = deque()
    q.append(go_id)
    while len(q) > 0:
        g_id = q.popleft()
        go_set.add(g_id)
        for ch_id in go[g_id]['children']:
            q.append(ch_id)
    return go_set

functions = get_go_set('GO:0003674')
functions.remove('GO:0003674')
Example #23
from utils import (  # assumed: the original opening line of this import is missing from the listing
    get_gene_ontology
)
import os
import sys
import pdb
from keras.optimizers import Adam
import shutil
from collections import deque
import pandas as pd

LAMBDA = 24
DATA_ROOT = 'data/fofe/'
CUR_LEVEL = 'level_1/'
NEXT_LEVEL = 'level_2/'

go = get_gene_ontology()
go_model = dict()

MAXLEN = 500


def get_gos_by_prot_id():
    data = dict()
    with open(DATA_ROOT + 'train.txt', 'r') as f:
        prot_id = 0
        for line in f:
            line = line.strip().split('\t')
            gos = line[2].split('; ')
            go_set = set()
            for go_id in gos:
                go_set.add(go_id)
Example #24
#!/usr/bin/env python
import sys
import numpy as np
import pandas as pd
from keras.utils import np_utils
from utils import get_gene_ontology
from collections import deque

DATA_ROOT = 'data/fofe/'
FILENAME = 'train.txt'

go = get_gene_ontology('go.obo')


def get_go_set(go_id):
    go_set = set()
    q = deque()
    q.append(go_id)
    while len(q) > 0:
        g_id = q.popleft()
        go_set.add(g_id)
        for ch_id in go[g_id]['children']:
            q.append(ch_id)
    return go_set


functions = get_go_set('GO:0003674')


def get_anchestors(go_id):
    go_set = set()
Example #25
#!/usr/bin/env python
import sys
import numpy as np
import pandas as pd
from keras.utils import np_utils
from utils import get_gene_ontology
from collections import deque


DATA_ROOT = "data/fofe/"
FILENAME = "train.txt"


go = get_gene_ontology("go.obo")


def get_go_set(go_id):
    go_set = set()
    q = deque()
    q.append(go_id)
    while len(q) > 0:
        g_id = q.popleft()
        go_set.add(g_id)
        for ch_id in go[g_id]["children"]:
            q.append(ch_id)
    return go_set


functions = get_go_set("GO:0003674")

Example #26
def get_predictions():
    root = 'data/cafa3/'
    annots = {}
    preds = {}
    go = get_gene_ontology()
    mf = pd.read_pickle(root + 'mf.pkl')
    mf_df = pd.read_pickle(root + 'test-mf-preds.pkl')
    functions = mf['functions']
    for i, row in mf_df.iterrows():
        prot_id = row['proteins']
        if prot_id not in preds:
            preds[prot_id] = set()
        for i in range(len(functions)):
            if row['predictions'][i] == 1:
                preds[prot_id].add(functions[i])
        if prot_id not in annots:
            annots[prot_id] = row['gos']

    cc = pd.read_pickle(root + 'cc.pkl')
    cc_df = pd.read_pickle(root + 'test-cc-preds.pkl')
    functions = cc['functions']
    for i, row in cc_df.iterrows():
        prot_id = row['proteins']
        if prot_id not in preds:
            preds[prot_id] = set()
        for i in range(len(functions)):
            if row['predictions'][i] == 1:
                preds[prot_id].add(functions[i])
        if prot_id not in annots:
            annots[prot_id] = row['gos']

    bp = pd.read_pickle(root + 'bp.pkl')
    bp_df = pd.read_pickle(root + 'test-bp-preds.pkl')
    functions = bp['functions']
    for i, row in bp_df.iterrows():
        prot_id = row['proteins']
        if prot_id not in preds:
            preds[prot_id] = set()
        for i in range(len(functions)):
            if row['predictions'][i] == 1:
                preds[prot_id].add(functions[i])
        if prot_id not in annots:
            annots[prot_id] = row['gos']

    # Removing parent classes
    for prot_id in preds:
        go_set = preds[prot_id]
        gos = go_set.copy()
        for go_id in gos:
            anchestors = get_anchestors(go, go_id)
            anchestors.remove(go_id)
            go_set -= anchestors

    proteins = sorted(annots.keys(),
                      key=lambda x: (x.split('_')[1], x.split('_')[0]))
    with open(root + 'test_predictions.tab', 'w') as f:
        for prot_id in proteins:
            f.write(prot_id)
            for go_id in preds[prot_id]:
                f.write('\t' + go_id)
            f.write('\n')

    with open(root + 'test_annotations.tab', 'w') as f:
        for prot_id in proteins:
            f.write(prot_id)
            for go_id in annots[prot_id]:
                if go_id in go:
                    f.write('\t' + go_id)
            f.write('\n')