Exemplo n.º 1
0
def main(split):
    """Publish combined GO state for the bp/mf/cc ontologies, then call run().

    The gene ontology is loaded from 'go.obo', the 'functions' columns of
    the three per-ontology pickles under ``DATA_ROOT`` are concatenated in
    bp, mf, cc order, and everything is stored in module-level globals
    before ``run()`` is invoked.

    Args:
        split: Stored unchanged in the global ``SPLIT``.
    """
    global SPLIT, GO_IDS, go, functions, func_set, go_indexes

    SPLIT = split
    GO_IDS = list(FUNC_DICT.values())
    go = get_gene_ontology('go.obo')

    # Start from the bp functions and append mf, then cc.
    functions = pd.read_pickle(DATA_ROOT + 'bp.pkl')['functions'].values
    for pickle_name in ('mf.pkl', 'cc.pkl'):
        frame = pd.read_pickle(DATA_ROOT + pickle_name)
        functions = np.concatenate((functions, frame['functions'].values))

    # Union of the GO sets rooted at the first three entries of GO_IDS.
    combined = get_go_set(go, GO_IDS[0])
    for root_id in (GO_IDS[1], GO_IDS[2]):
        combined = combined | get_go_set(go, root_id)
    func_set = combined

    print(len(functions))

    # Position of every GO id inside the concatenated `functions` array.
    go_indexes = {go_id: ind for ind, go_id in enumerate(functions)}
    run()
Exemplo n.º 2
0
def main(function, device, org, train):
    """Initialise the module-level GO state for one ontology and call model().

    Args:
        function: Key into ``FUNC_DICT``; also the stem of the pickle read
            from ``DATA_ROOT``.
        device: Device name; handed to ``tf.device`` with a leading '/'.
        org: Stored in the global ``ORG`` and logged when it is not None.
        train: Forwarded to ``model`` as ``is_train``.
    """
    global FUNCTION, GO_ID, go, ORG
    global functions, func_set, all_functions, go_indexes, node_names

    FUNCTION = function
    GO_ID = FUNC_DICT[FUNCTION]
    go = get_gene_ontology('go.obo')
    ORG = org

    frame = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = frame['functions'].values
    func_set = set(functions)
    all_functions = get_go_set(go, GO_ID)

    summary = 'Functions: %s %d' % (FUNCTION, len(functions))
    logging.info(summary)
    if ORG is not None:
        logging.info('Organism %s' % ORG)

    # Position of every GO id inside `functions`.
    go_indexes = {go_id: ind for ind, go_id in enumerate(functions)}
    node_names = set()

    device_name = '/' + device
    with tf.device(device_name):
        model(is_train=train)
Exemplo n.º 3
0
def main(function, device, org, train, param, embeddingmethod, shuffleseed,
         buildmethod, evomodel, cached):
    """Set up module-level experiment state and invoke model().

    Besides the usual GO globals, this variant records the experiment
    configuration in globals, derives an experiment id from it and makes
    sure the matching results directory exists.

    Args:
        function: Key into ``FUNC_DICT``; also the stem of the pickle read
            from ``DATA_ROOT``.
        device: Device name; handed to ``tf.device`` with a leading '/'.
        org: Stored in the global ``ORG`` and logged when it is not None.
        train: Accepted but not used -- see the note at the ``model`` call.
        param: Accepted but not used by this function.
        embeddingmethod: Stored in ``EMBEDDINGMETHOD``; part of the
            experiment id.
        shuffleseed: Stored in ``SEED``; part of the experiment id.
        buildmethod: Stored in ``BUILDMETHOD``; part of the experiment id.
        evomodel: Stored in ``EVOMODEL``; part of the experiment id.
        cached: Stored in ``CACHED``.
    """
    global CACHED, BUILDMETHOD, EVOMODEL, EMBEDDINGMETHOD, SEED
    global FUNCTION, GO_ID, go, ORG
    global functions, func_set, all_functions
    global experiment_id, resdir, go_indexes, node_names

    CACHED = cached
    BUILDMETHOD = buildmethod
    EVOMODEL = evomodel
    EMBEDDINGMETHOD = embeddingmethod
    SEED = shuffleseed
    FUNCTION = function
    GO_ID = FUNC_DICT[FUNCTION]
    go = get_gene_ontology('go.obo')
    ORG = org

    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = func_df['functions'].values
    func_set = set(functions)
    all_functions = get_go_set(go, GO_ID)

    experiment_id = '-'.join(
        str(part) for part in
        (function, embeddingmethod, shuffleseed, buildmethod, evomodel))
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))

    resdir = "results/" + experiment_id
    # makedirs also creates a missing 'results/' parent and does not race
    # with another process creating the same directory.
    os.makedirs(resdir, exist_ok=True)

    if ORG is not None:
        logging.info('Organism %s' % ORG)

    # Position of every GO id inside `functions`.
    go_indexes = {go_id: ind for ind, go_id in enumerate(functions)}
    node_names = set()

    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 3,
            'nb_dense': 2,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32,
        }
        # NOTE: `train` is deliberately ignored here; the call that forwarded
        # it (`model(params, is_train=train)`) was disabled in favour of
        # always passing is_train=True.
        model(params, is_train=True)
Exemplo n.º 4
0
def main(function, device, org, train, param):
    """Initialise the GO globals, call model(), then decode a sweep slice.

    After the single ``model`` call, the indices
    ``param * 32 .. param * 32 + 31`` are decoded as four base-4 digits that
    select ``embedding_dims``, ``nb_filter``, ``nb_conv`` and ``nb_dense``
    from fixed candidate lists, and ``params`` is updated accordingly.

    Args:
        function: Key into ``FUNC_DICT``; also the stem of the pickle read
            from ``DATA_ROOT``.
        device: Device name; handed to ``tf.device`` with a leading '/'.
        org: Stored in the global ``ORG`` and logged when it is not None.
        train: Forwarded to ``model`` as ``is_train``.
        param: Selects which block of 32 sweep indices is decoded.

    Raises:
        IndexError: If a decoded digit falls outside a candidate list (the
            fourth digit is not reduced modulo 4, so this happens once
            ``param * 32 + 31 >= 256``).
    """
    global FUNCTION, GO_ID, go, ORG
    global functions, func_set, all_functions, go_indexes, node_names

    FUNCTION = function
    GO_ID = FUNC_DICT[FUNCTION]
    go = get_gene_ontology('go.obo')
    ORG = org

    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = func_df['functions'].values
    func_set = set(functions)
    all_functions = get_go_set(go, GO_ID)

    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    if ORG is not None:
        logging.info('Organism %s' % ORG)

    # Position of every GO id inside `functions`.
    go_indexes = {go_id: ind for ind, go_id in enumerate(functions)}
    node_names = set()

    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 3,
            'nb_dense': 2,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32,
        }
        model(params, is_train=train)

        dims = [64, 128, 256, 512]
        nb_filters = [16, 32, 64, 128]
        nb_convs = [1, 2, 3, 4]
        nb_dense = [1, 2, 3, 4]
        for i in range(param * 32, param * 32 + 32):
            # Peel off base-4 digits with integer division. The previous
            # `i / 4` is true division on Python 3 and produced floats that
            # cannot be used as list indices.
            rest, dim = divmod(i, 4)
            rest, nb_fil = divmod(rest, 4)
            den, conv = divmod(rest, 4)
            params['embedding_dims'] = dims[dim]
            params['nb_filter'] = nb_filters[nb_fil]
            params['nb_conv'] = nb_convs[conv]
            params['nb_dense'] = nb_dense[den]
            # NOTE(review): nothing visible here consumes the updated
            # `params`; confirm whether a model(...) call belongs in this
            # loop.
Exemplo n.º 5
0
def main(device, org, train):
    """Initialise GO globals for all three ontologies and call model().

    The 'functions' columns of the bp, mf and cc pickles under
    ``DATA_ROOT`` are concatenated in that order; ``all_functions`` is the
    union of the GO sets rooted at the first three ``FUNC_DICT`` values.

    Args:
        device: Device name; handed to ``tf.device`` with a leading '/'.
        org: Stored in the global ``ORG`` and logged when it is not None.
        train: Forwarded to ``model`` as ``is_train``.
    """
    global GO_IDS, go, ORG
    global functions, func_set, all_functions, go_indexes, node_names

    # Materialise the values: on Python 3 `dict.values()` is a view that
    # does not support the GO_IDS[0] style indexing used below.
    GO_IDS = list(FUNC_DICT.values())
    go = get_gene_ontology('go.obo')
    ORG = org

    func_df = pd.read_pickle(DATA_ROOT + 'bp.pkl')
    functions = func_df['functions'].values
    func_df = pd.read_pickle(DATA_ROOT + 'mf.pkl')
    functions = np.concatenate((functions, func_df['functions'].values))
    func_df = pd.read_pickle(DATA_ROOT + 'cc.pkl')
    functions = np.concatenate((functions, func_df['functions'].values))

    func_set = set(functions)
    all_functions = (
        get_go_set(go, GO_IDS[0])
        | get_go_set(go, GO_IDS[1])
        | get_go_set(go, GO_IDS[2]))

    logging.info('Functions: %d' % (len(functions), ))
    if ORG is not None:
        logging.info('Organism %s' % ORG)

    # Position of every GO id inside the concatenated `functions` array.
    go_indexes = {go_id: ind for ind, go_id in enumerate(functions)}
    node_names = set()

    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32,
        }
        model(params, is_train=train)
def main(function, test_df, device):
    """Initialise the GO globals and return model()'s result for `test_df`.

    Organism, output filename and the training flag are fixed inside this
    function rather than supplied by the caller.

    Args:
        function: Key into ``FUNC_DICT``; also the stem of the pickle read
            from ``DATA_ROOT``.
        test_df: Passed to ``model`` as its second positional argument.
        device: Device name; handed to ``tf.device`` with a leading '/'.

    Returns:
        Whatever ``model(params, test_df, is_train=False)`` returns; the
        same object is also stored in the global ``prediction_list``.
    """
    global FUNCTION, GO_ID, go, ORG
    global functions, func_set, all_functions, go_indexes
    global node_names, FILENAME, PARAMS, prediction_list

    # Settings that other variants of this entry point take as arguments.
    org = None
    param = 0  # not used below
    filename = 'ResultSequenceStructPPI.txt'
    train = False

    FUNCTION = function
    GO_ID = FUNC_DICT[FUNCTION]
    go = get_gene_ontology('go.obo')
    ORG = org

    frame = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = frame['functions'].values
    func_set = set(functions)
    all_functions = get_go_set(go, GO_ID)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))

    # Forward map (GO id -> position) and its inverse (position -> GO id).
    # The inverse is built for the prediction list but not read in this
    # function.
    go_indexes = {go_id: ind for ind, go_id in enumerate(functions)}
    indexes_for_prediction = dict(enumerate(functions))

    FILENAME = filename
    node_names = set()

    with tf.device('/' + device):
        PARAMS = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 1,
            'nb_dense': 1,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32,
        }
        prediction_list = model(PARAMS, test_df, is_train=train)
    return prediction_list
Exemplo n.º 7
0
def main(function, device, org, train, param, filename):
    """Initialise the GO globals, record the output filename, call model().

    Args:
        function: Key into ``FUNC_DICT``; also the stem of the pickle read
            from ``DATA_ROOT``.
        device: Device name; handed to ``tf.device`` with a leading '/'.
        org: Stored in the global ``ORG`` and logged when it is not None.
        train: Forwarded to ``model`` as ``is_train``.
        param: Accepted but not used by this function.
        filename: Stored in the global ``FILENAME``.
    """
    global FUNCTION, GO_ID, go, ORG
    global functions, func_set, all_functions, go_indexes
    global node_names, FILENAME, PARAMS

    FUNCTION = function
    GO_ID = FUNC_DICT[FUNCTION]
    go = get_gene_ontology('go.obo')
    ORG = org

    frame = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = frame['functions'].values
    func_set = set(functions)
    all_functions = get_go_set(go, GO_ID)

    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    if ORG is not None:
        logging.info('Organism %s' % ORG)

    # Position of every GO id inside `functions`.
    go_indexes = {go_id: ind for ind, go_id in enumerate(functions)}
    FILENAME = filename
    node_names = set()

    with tf.device('/' + device):
        # The same dict is published as PARAMS and passed to model().
        PARAMS = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 1,
            'nb_dense': 1,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32,
        }
        model(PARAMS, is_train=train)
Exemplo n.º 8
0
def main(function, split):
    """Publish the GO state for one ontology as globals, then call run().

    Args:
        function: Key into ``FUNC_DICT``; also the stem of the pickle read
            from ``DATA_ROOT``.
        split: Stored unchanged in the global ``SPLIT``.
    """
    global SPLIT, GO_ID, go, FUNCTION, functions, func_set, go_indexes

    SPLIT = split
    GO_ID = FUNC_DICT[function]
    go = get_gene_ontology('go.obo')
    FUNCTION = function

    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = func_df['functions'].values
    # Unlike the other entry points, func_set here is the GO set rooted at
    # GO_ID rather than set(functions).
    func_set = get_go_set(go, GO_ID)

    # print() call form: the bare `print len(...)` statement is a
    # SyntaxError on Python 3.
    print(len(functions))

    # Position of every GO id inside `functions`.
    go_indexes = {go_id: ind for ind, go_id in enumerate(functions)}
    run()
Exemplo n.º 9
0
def get_layers_recursive(inputs, node_output_dim=256):
    """Build per-GO-node outputs by walking the ontology below ``GO_ID``.

    Relies on the module-level globals ``GO_ID``, ``go``, ``func_set`` and
    ``functions``.

    Args:
        inputs: Input passed through an initial ``Dense`` layer and then
            fed to the first level of function nodes.
            (presumably a Keras tensor -- confirm against the caller)
        node_output_dim: Size given to the initial ``Dense`` layer and
            forwarded to ``get_function_node`` for every node.

    Returns:
        dict mapping a GO id to a dict with key ``'outputs'`` (list of every
        output produced for that node during the traversal) and, for ids
        listed in ``functions``, key ``'output'`` (the single or max-merged
        output).
    """
    layers = dict()
    name = get_node_name(GO_ID)
    # Shared root layer, named after the ontology root.
    inputs = Dense(
        node_output_dim, activation='relu', name=name)(inputs)

    def dfs(node_id, inputs):
        # Create one function node per visit; a node that is visited more
        # than once accumulates one entry in 'outputs' per visit.
        name = get_node_name(node_id, unique=True)
        net, output = get_function_node(name, inputs, node_output_dim)
        # Only descend into children that belong to func_set.
        childs = [
            n_id for n_id in go[node_id]['children'] if n_id in func_set]
        if node_id not in layers:
            layers[node_id] = {'outputs': [output]}
        else:
            layers[node_id]['outputs'].append(output)
        for ch_id in childs:
            # Children receive this node's `net`, not the original inputs.
            dfs(ch_id, net)

    # Start the traversal from the root's direct children in func_set.
    for node_id in go[GO_ID]['children']:
        if node_id in func_set:
            dfs(node_id, inputs)

    # Second pass: collapse the collected outputs of each function into a
    # single 'output'.
    # NOTE(review): layers[node_id] raises KeyError for any id in
    # `functions` that the traversal above never reached -- confirm every
    # function is reachable through func_set.
    for node_id in functions:
        childs = get_go_set(go, node_id).intersection(func_set)
        if len(childs) == 0:
            if len(layers[node_id]['outputs']) == 1:
                # Single output: use it directly, no merge layer needed.
                layers[node_id]['output'] = layers[node_id]['outputs'][0]
            else:
                name = get_node_name(node_id, unique=True)
                output = merge(
                    layers[node_id]['outputs'], mode='max', name=name)
                layers[node_id]['output'] = output
        else:
            # `outputs` aliases layers[node_id]['outputs'], so the `+=`
            # below extends that stored list in place; nodes processed later
            # see the already-extended lists.
            outputs = layers[node_id]['outputs']
            for ch_id in childs:
                outputs += layers[ch_id]['outputs']
            name = get_node_name(node_id, unique=True)
            output = merge(
                outputs, mode='max', name=name)
            layers[node_id]['output'] = output
    return layers
Exemplo n.º 10
0
def main(function, device, model_name):
    """Initialise the GO globals for one ontology and call model(model_name).

    Args:
        function: Key into ``FUNC_DICT``; also the stem of the pickle read
            from ``DATA_ROOT``.
        device: Device name; handed to ``tf.device`` with a leading '/'.
        model_name: Passed to ``model`` as its only argument.
    """
    global FUNCTION, GO_ID, go
    global functions, func_set, all_functions, go_indexes

    FUNCTION = function
    GO_ID = FUNC_DICT[FUNCTION]
    go = get_gene_ontology('go.obo')

    frame = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = frame['functions'].values
    func_set = set(functions)
    all_functions = get_go_set(go, GO_ID)

    function_count = len(functions)
    logging.info(function_count)

    # Position of every GO id inside `functions`.
    go_indexes = {go_id: ind for ind, go_id in enumerate(functions)}

    device_name = '/' + device
    with tf.device(device_name):
        model(model_name)
Exemplo n.º 11
0
def main(function):
    """Score stored model predictions against the CAFA3 leaf-only annotations.

    Steps:
      1. Load the ontology, the function list and
         ``model_preds_<function>.pkl`` from ``DATA_ROOT``.
      2. Read the ground-truth file and group GO ids by target.
      3. Build, per target, the set of ancestor GO ids (restricted to
         ``all_functions``) and a 0/1 label vector over ``functions``.
      4. Inner-join labels with the predictions on 'targets' and write the
         result to ``model_preds_filtered_<function>.pkl``.
      5. Print the first three values returned by ``compute_performance``
         (named f, p, r -- presumably F-measure, precision and recall),
         then the ROC AUC and the MCC.

    Args:
        function: Key into ``FUNC_DICT``; also used to build the pickle and
            ground-truth file names.

    Raises:
        IndexError: If a ground-truth line has fewer than two tab-separated
            fields, or if the joined frame is empty.
    """
    global go, functions, func_set, GO_ID, all_functions

    go = get_gene_ontology()
    func_df = pd.read_pickle(DATA_ROOT + function + '.pkl')
    functions = func_df['functions'].values
    # Position of every GO id inside `functions`.
    func_index = {go_id: i for i, go_id in enumerate(functions)}
    func_set = set(func_index)
    GO_ID = FUNC_DICT[function]
    all_functions = get_go_set(go, GO_ID)
    pred_df = pd.read_pickle(DATA_ROOT + 'model_preds_' + function + '.pkl')

    # NOTE: code that loaded external predictions ('data/gofdr/' and
    # 'data/ffpred/') into a per-target dict was already disabled here; the
    # dict was therefore always empty and its dead bookkeeping was dropped.

    # target id -> list of annotated GO ids, in file order.
    targets = dict()
    truth_path = ('data/cafa3/CAFA3_benchmark20170605/groundtruth/leafonly_' +
                  function.upper() + 'O_unique.txt')
    with open(truth_path) as truth_file:
        for line in truth_file:
            it = line.strip().split('\t')
            target, go_id = it[0], it[1]
            targets.setdefault(target, []).append(go_id)

    target_ids = list()
    labels = list()
    go_ids = list()
    for target, gos in targets.items():
        # Expand each annotation inside this ontology to its ancestors.
        go_set = set()
        for go_id in gos:
            if go_id in all_functions:
                go_set |= get_anchestors(go, go_id)
        label = np.zeros((len(functions), ), dtype=np.int32)
        for go_id in go_set:
            if go_id in func_index:
                label[func_index[go_id]] = 1
        target_ids.append(target)
        go_ids.append(go_set)
        labels.append(label)

    df = pd.DataFrame({'targets': target_ids, 'gos': go_ids, 'labels': labels})
    # Keep only targets that have both ground truth and a prediction.
    df = pd.merge(df, pred_df, on='targets', how='inner')
    df.to_pickle(DATA_ROOT + 'model_preds_filtered_' + function + '.pkl')

    def reshape(values):
        # Turn an object array of equal-length vectors into a 2-D array.
        values = np.hstack(values).reshape(len(values), len(values[0]))
        return values

    preds = reshape(df['predictions'].values)
    labels = reshape(df['labels'].values)
    gos = df['gos'].values

    f_score, p, r, _t, preds_max = compute_performance(preds, labels, gos)
    print(f_score, p, r)

    roc_auc = compute_roc(preds, labels)
    print(roc_auc)

    mcc = compute_mcc(preds_max, labels)
    print(mcc)
# Prefix prepended to the pickle file names read in this file.
DATA_ROOT = 'data/'
# NOTE(review): MAXLEN is not referenced in the visible code; its meaning
# cannot be determined from here.
MAXLEN = 1000

# NOTE(review): the statements below read like the body of a function whose
# `def` line is missing: `function` is not defined in the visible code (so
# `FUNCTION = function` raises NameError when this runs at module level),
# and `global` declarations have no effect at module scope. Confirm where
# this fragment belongs before relying on it.
global FUNCTION
FUNCTION = function
global GO_ID
GO_ID = FUNC_DICT[FUNCTION]
global go
go = get_gene_ontology('go.obo')
# Function list for the selected ontology, read from DATA_ROOT.
func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
global functions
functions = func_df['functions'].values
global func_set
func_set = set(functions)
global all_functions
all_functions = get_go_set(go, GO_ID)


def compute_roc(preds, labels):
    """Return the area under one ROC curve pooled over all entries.

    Both inputs are flattened first, so a single curve is computed over
    every entry rather than one curve per class.

    Args:
        preds: Object with a ``flatten()`` method holding the scores.
        labels: Object with a ``flatten()`` method holding the true labels.

    Returns:
        The value of ``auc`` applied to the curve from ``roc_curve``
        (presumably the scikit-learn functions -- confirm the imports).
    """
    y_true = labels.flatten()
    y_score = preds.flatten()
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_score)
    return auc(false_pos_rate, true_pos_rate)


def compute_mcc(preds, labels):
    """Return ``matthews_corrcoef`` of the flattened labels and predictions.

    Args:
        preds: Object with a ``flatten()`` method holding the predictions.
        labels: Object with a ``flatten()`` method holding the true labels.

    Returns:
        The result of ``matthews_corrcoef`` (presumably the scikit-learn
        Matthews correlation coefficient -- confirm the import).
    """
    y_true = labels.flatten()
    y_pred = preds.flatten()
    return matthews_corrcoef(y_true, y_pred)