def main(split):
    """Initialise the module-level ontology state for `split` and call run().

    Sets the globals SPLIT, GO_IDS, go, functions, func_set and go_indexes,
    prints the number of loaded terms, then hands over to `run()`.

    Args:
        split: value stored unchanged in the global SPLIT.
    """
    global SPLIT, GO_IDS, go, functions, func_set, go_indexes

    SPLIT = split
    GO_IDS = list(FUNC_DICT.values())
    go = get_gene_ontology('go.obo')

    # The term arrays of the bp, mf and cc pickles are joined in that order.
    functions = pd.read_pickle(DATA_ROOT + 'bp.pkl')['functions'].values
    for pickle_name in ('mf.pkl', 'cc.pkl'):
        frame = pd.read_pickle(DATA_ROOT + pickle_name)
        functions = np.concatenate((functions, frame['functions'].values))

    # Union of the GO sets rooted at the first three entries of GO_IDS.
    func_set = (
        get_go_set(go, GO_IDS[0])
        | get_go_set(go, GO_IDS[1])
        | get_go_set(go, GO_IDS[2]))
    print(len(functions))

    # Map every term to its position in `functions`.
    go_indexes = {go_id: ind for ind, go_id in enumerate(functions)}
    run()
def main(function, device, org, train):
    """Populate the module globals for one ontology and run the model.

    Args:
        function: key into FUNC_DICT; also the stem of the pickle file read
            from DATA_ROOT.
        device: device name; '/' is prepended before it is given to
            tf.device.
        org: stored in the global ORG and logged when it is not None.
        train: forwarded to `model` as `is_train`.
    """
    global FUNCTION, GO_ID, go, ORG
    global functions, func_set, all_functions, go_indexes, node_names

    FUNCTION = function
    GO_ID = FUNC_DICT[FUNCTION]
    go = get_gene_ontology('go.obo')
    ORG = org

    frame = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = frame['functions'].values
    func_set = set(functions)
    all_functions = get_go_set(go, GO_ID)

    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    if ORG is not None:
        logging.info('Organism %s' % ORG)

    # Map every term to its position in `functions`.
    go_indexes = {go_id: ind for ind, go_id in enumerate(functions)}
    node_names = set()

    with tf.device('/' + device):
        model(is_train=train)
def main(function, device, org, train, param, embeddingmethod, shuffleseed,
         buildmethod, evomodel, cached):
    """Configure the module globals for one experiment and run the model.

    Besides the ontology globals, this derives an experiment id from the
    arguments and makes sure the directory ``results/<experiment_id>``
    exists before the model is built.

    Args:
        function: key into FUNC_DICT; also the stem of the pickle file read
            from DATA_ROOT.
        device: device name; '/' is prepended before it is given to
            tf.device.
        org: stored in the global ORG and logged when it is not None.
        train: accepted but not used by the visible code (see NOTE below).
        param: accepted but not used by the visible code.
        embeddingmethod: stored in EMBEDDINGMETHOD; part of the experiment id.
        shuffleseed: stored in SEED; part of the experiment id.
        buildmethod: stored in BUILDMETHOD; part of the experiment id.
        evomodel: stored in EVOMODEL; part of the experiment id.
        cached: stored in CACHED.
    """
    global CACHED, BUILDMETHOD, EVOMODEL, EMBEDDINGMETHOD, SEED
    global FUNCTION, GO_ID, go, ORG
    global functions, func_set, all_functions
    global experiment_id, resdir, go_indexes, node_names

    CACHED = cached
    BUILDMETHOD = buildmethod
    EVOMODEL = evomodel
    EMBEDDINGMETHOD = embeddingmethod
    SEED = shuffleseed
    FUNCTION = function
    GO_ID = FUNC_DICT[FUNCTION]
    go = get_gene_ontology('go.obo')
    ORG = org

    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = func_df['functions'].values
    func_set = set(functions)
    all_functions = get_go_set(go, GO_ID)

    experiment_id = str(function) + '-' + str(embeddingmethod) + '-' + str(
        shuffleseed) + '-' + str(buildmethod) + '-' + str(evomodel)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))

    resdir = "results/" + experiment_id
    if not os.path.isdir(resdir):
        # makedirs also creates the parent "results/" directory when it is
        # missing; plain os.mkdir would raise in that case.
        os.makedirs(resdir)

    if ORG is not None:
        logging.info('Organism %s' % ORG)

    # Map every term to its position in `functions`.
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    node_names = set()

    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 3,
            'nb_dense': 2,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32,
        }
        # NOTE(review): `train` is ignored here; the model is always built
        # with is_train=True (the `is_train=train` call was commented out in
        # the original). Confirm this is intentional.
        model(params, is_train=True)
def main(function, device, org, train, param):
    """Populate the module globals for one ontology and run the model.

    After the model call, a block of 32 consecutive grid indices selected by
    `param` is decoded into hyper-parameter choices that are written back
    into `params`.

    Args:
        function: key into FUNC_DICT; also the stem of the pickle file read
            from DATA_ROOT.
        device: device name; '/' is prepended before it is given to
            tf.device.
        org: stored in the global ORG and logged when it is not None.
        train: forwarded to `model` as `is_train`.
        param: selects the index range ``[param * 32, param * 32 + 32)``.

    Raises:
        IndexError: if a decoded dense-layer index falls outside the
            four-entry choice list (happens once an index reaches 256).
    """
    global FUNCTION, GO_ID, go, ORG
    global functions, func_set, all_functions, go_indexes, node_names

    FUNCTION = function
    GO_ID = FUNC_DICT[FUNCTION]
    go = get_gene_ontology('go.obo')
    ORG = org

    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = func_df['functions'].values
    func_set = set(functions)
    all_functions = get_go_set(go, GO_ID)

    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    if ORG is not None:
        logging.info('Organism %s' % ORG)

    # Map every term to its position in `functions`.
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    node_names = set()

    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 3,
            'nb_dense': 2,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32,
        }
        model(params, is_train=train)

        dims = [64, 128, 256, 512]
        nb_filters = [16, 32, 64, 128]
        nb_convs = [1, 2, 3, 4]
        nb_dense = [1, 2, 3, 4]
        # NOTE(review): in the visible code the values decoded below are
        # only stored in `params`; nothing consumes them after the loop.
        for i in range(param * 32, param * 32 + 32):
            # Decode `i` as base-4 digits, least significant first. divmod
            # uses floor division, so the digits stay integers on Python 3
            # as well (plain `/` would yield floats and break the list
            # indexing below).
            i, dim = divmod(i, 4)
            i, nb_fil = divmod(i, 4)
            den, conv = divmod(i, 4)
            params['embedding_dims'] = dims[dim]
            params['nb_filter'] = nb_filters[nb_fil]
            params['nb_conv'] = nb_convs[conv]
            params['nb_dense'] = nb_dense[den]
def main(device, org, train):
    """Populate the module globals for all three ontologies and run the model.

    Args:
        device: device name; '/' is prepended before it is given to
            tf.device.
        org: stored in the global ORG and logged when it is not None.
        train: forwarded to `model` as `is_train`.
    """
    global GO_IDS, go, ORG
    global functions, func_set, all_functions, go_indexes, node_names

    # Materialise the values as a list: GO_IDS is indexed below and a
    # Python 3 dict view does not support indexing.
    GO_IDS = list(FUNC_DICT.values())
    go = get_gene_ontology('go.obo')
    ORG = org

    # The term arrays of the bp, mf and cc pickles are joined in that order.
    func_df = pd.read_pickle(DATA_ROOT + 'bp.pkl')
    functions = func_df['functions'].values
    func_df = pd.read_pickle(DATA_ROOT + 'mf.pkl')
    functions = np.concatenate((functions, func_df['functions'].values))
    func_df = pd.read_pickle(DATA_ROOT + 'cc.pkl')
    functions = np.concatenate((functions, func_df['functions'].values))

    func_set = set(functions)
    all_functions = (
        get_go_set(go, GO_IDS[0])
        | get_go_set(go, GO_IDS[1])
        | get_go_set(go, GO_IDS[2]))

    logging.info('Functions: %d' % (len(functions), ))
    if ORG is not None:
        logging.info('Organism %s' % ORG)

    # Map every term to its position in `functions`.
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    node_names = set()

    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32,
        }
        model(params, is_train=train)
def main(function, test_df, device):
    """Run the model in non-training mode on `test_df`.

    Sets the module globals used by the model (ORG is set to None, FILENAME
    to 'ResultSequenceStructPPI.txt'), calls `model` with `is_train=False`
    and returns whatever it returns.

    Args:
        function: key into FUNC_DICT; also the stem of the pickle file read
            from DATA_ROOT.
        test_df: passed through to `model` as its second positional argument.
        device: device name; '/' is prepended before it is given to
            tf.device.

    Returns:
        The value returned by `model`, which is also stored in the global
        `prediction_list`.
    """
    global FUNCTION, GO_ID, go, ORG
    global functions, func_set, all_functions, go_indexes
    global node_names, FILENAME, PARAMS, prediction_list

    FUNCTION = function
    GO_ID = FUNC_DICT[FUNCTION]
    go = get_gene_ontology('go.obo')
    ORG = None

    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = func_df['functions'].values
    func_set = set(functions)
    all_functions = get_go_set(go, GO_ID)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))

    # Map every term to its position in `functions`.
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind

    FILENAME = 'ResultSequenceStructPPI.txt'
    node_names = set()

    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 1,
            'nb_dense': 1,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32,
        }
        PARAMS = params
        prediction_list = model(params, test_df, is_train=False)
    return prediction_list
def main(function, device, org, train, param, filename):
    """Populate the module globals for one ontology and run the model.

    Args:
        function: key into FUNC_DICT; also the stem of the pickle file read
            from DATA_ROOT.
        device: device name; '/' is prepended before it is given to
            tf.device.
        org: stored in the global ORG and logged when it is not None.
        train: forwarded to `model` as `is_train`.
        param: accepted but not used by the visible code.
        filename: stored in the global FILENAME.
    """
    global FUNCTION, GO_ID, go, ORG
    global functions, func_set, all_functions, go_indexes
    global node_names, FILENAME, PARAMS

    FUNCTION = function
    GO_ID = FUNC_DICT[FUNCTION]
    go = get_gene_ontology('go.obo')
    ORG = org

    frame = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = frame['functions'].values
    func_set = set(functions)
    all_functions = get_go_set(go, GO_ID)

    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    if ORG is not None:
        logging.info('Organism %s' % ORG)

    # Map every term to its position in `functions`.
    go_indexes = {go_id: ind for ind, go_id in enumerate(functions)}
    FILENAME = filename
    node_names = set()

    with tf.device('/' + device):
        hyper_params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 1,
            'nb_dense': 1,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32,
        }
        PARAMS = hyper_params
        model(hyper_params, is_train=train)
def main(function, split):
    """Initialise the module-level ontology state and call run().

    Sets the globals SPLIT, GO_ID, go, FUNCTION, functions, func_set and
    go_indexes, prints the number of loaded terms, then hands over to
    `run()`.

    Args:
        function: key into FUNC_DICT; also the stem of the pickle file read
            from DATA_ROOT.
        split: value stored unchanged in the global SPLIT.
    """
    global SPLIT, GO_ID, go, FUNCTION, functions, func_set, go_indexes

    SPLIT = split
    GO_ID = FUNC_DICT[function]
    go = get_gene_ontology('go.obo')
    FUNCTION = function

    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = func_df['functions'].values
    func_set = get_go_set(go, GO_ID)

    # Function-call form prints the same text on Python 2 and is the only
    # valid spelling on Python 3.
    print(len(functions))

    # Map every term to its position in `functions`.
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    run()
def get_layers_recursive(inputs, node_output_dim=256):
    """Build one output tensor per term by walking the ontology depth-first.

    A Dense layer named after GO_ID is applied to `inputs`; from there every
    child chain restricted to `func_set` is visited recursively, creating a
    function node per visit. Afterwards, for each entry of `functions`, the
    collected outputs are combined into a single ``'output'``.

    Args:
        inputs: tensor fed to the root Dense layer.
        node_output_dim: width of the root Dense layer; also passed to
            `get_function_node`.

    Returns:
        dict mapping term id to ``{'outputs': [...], 'output': tensor}``.
        The ``'output'`` key is only set for ids listed in `functions`.
    """
    layers = dict()
    root_name = get_node_name(GO_ID)
    inputs = Dense(
        node_output_dim, activation='relu', name=root_name)(inputs)

    def dfs(node_id, inputs):
        # One function node is created per visit, so a term reachable
        # through several parents accumulates several outputs.
        node_name = get_node_name(node_id, unique=True)
        net, output = get_function_node(node_name, inputs, node_output_dim)
        kept_children = [
            n_id for n_id in go[node_id]['children'] if n_id in func_set]
        layers.setdefault(node_id, {'outputs': []})['outputs'].append(output)
        for child_id in kept_children:
            dfs(child_id, net)

    for top_id in go[GO_ID]['children']:
        if top_id in func_set:
            dfs(top_id, inputs)

    for node_id in functions:
        related = get_go_set(go, node_id).intersection(func_set)
        entry = layers[node_id]
        if not related:
            if len(entry['outputs']) == 1:
                entry['output'] = entry['outputs'][0]
            else:
                merged_name = get_node_name(node_id, unique=True)
                entry['output'] = merge(
                    entry['outputs'], mode='max', name=merged_name)
            continue

        # NOTE(review): `collected` is the very list stored under
        # entry['outputs'], so extending it mutates the entry in place and
        # terms handled later see the already-extended lists. Confirm that
        # this is intended.
        collected = entry['outputs']
        for rel_id in related:
            collected.extend(layers[rel_id]['outputs'])
        merged_name = get_node_name(node_id, unique=True)
        entry['output'] = merge(collected, mode='max', name=merged_name)
    return layers
def main(function, device, model_name):
    """Populate the module globals for one ontology and call `model`.

    Args:
        function: key into FUNC_DICT; also the stem of the pickle file read
            from DATA_ROOT.
        device: device name; '/' is prepended before it is given to
            tf.device.
        model_name: passed to `model` as its only argument.
    """
    global FUNCTION, GO_ID, go
    global functions, func_set, all_functions, go_indexes

    FUNCTION = function
    GO_ID = FUNC_DICT[FUNCTION]
    go = get_gene_ontology('go.obo')

    frame = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = frame['functions'].values
    func_set = set(functions)
    all_functions = get_go_set(go, GO_ID)
    logging.info(len(functions))

    # Map every term to its position in `functions`.
    go_indexes = {go_id: ind for ind, go_id in enumerate(functions)}

    with tf.device('/' + device):
        model(model_name)
def main(function):
    """Score stored model predictions against the CAFA3 leaf-only ground truth.

    Steps:
      1. Load the ontology, the term list and the stored predictions for
         `function`.
      2. Read the ground-truth file, expand each target's annotations with
         `get_anchestors` and turn them into a 0/1 label vector over
         `functions`.
      3. Inner-join labels and predictions on ``'targets'`` and write the
         result to ``model_preds_filtered_<function>.pkl`` under DATA_ROOT.
      4. Print the first three values returned by `compute_performance`,
         then the results of `compute_roc` and `compute_mcc`.

    Args:
        function: key into FUNC_DICT; also used to build the pickle file
            names and, upper-cased, the ground-truth file name.
    """
    global go, functions, func_set, GO_ID, all_functions

    go = get_gene_ontology()
    func_df = pd.read_pickle(DATA_ROOT + function + '.pkl')
    functions = func_df['functions'].values
    func_index = {go_id: i for i, go_id in enumerate(functions)}
    func_set = set(func_index)
    GO_ID = FUNC_DICT[function]
    all_functions = get_go_set(go, GO_ID)

    pred_df = pd.read_pickle(DATA_ROOT + 'model_preds_' + function + '.pkl')

    # target id -> list of annotated GO ids, read from a tab-separated file.
    targets = dict()
    groundtruth_path = (
        'data/cafa3/CAFA3_benchmark20170605/groundtruth/leafonly_'
        + function.upper() + 'O_unique.txt')
    with open(groundtruth_path) as gt_file:
        for line in gt_file:
            if not line.strip():
                # Tolerate blank lines instead of failing on the split below.
                continue
            it = line.strip().split('\t')
            target = it[0]
            go_id = it[1]
            targets.setdefault(target, list()).append(go_id)

    target_ids = list()
    labels = list()
    go_ids = list()
    for target, gos in targets.items():
        # Expand every annotation inside this ontology via get_anchestors.
        go_set = set()
        for go_id in gos:
            if go_id in all_functions:
                go_set |= get_anchestors(go, go_id)
        label = np.zeros((len(functions), ), dtype=np.int32)
        for go_id in go_set:
            if go_id in func_index:
                label[func_index[go_id]] = 1
        target_ids.append(target)
        go_ids.append(go_set)
        labels.append(label)

    df = pd.DataFrame({'targets': target_ids, 'gos': go_ids, 'labels': labels})
    # Keep only targets that have both a label vector and a prediction.
    df = pd.merge(df, pred_df, on='targets', how='inner')
    df.to_pickle(DATA_ROOT + 'model_preds_filtered_' + function + '.pkl')

    def reshape(values):
        # Stack an object array of equal-length vectors into a 2-D array.
        values = np.hstack(values).reshape(len(values), len(values[0]))
        return values

    preds = reshape(df['predictions'].values)
    labels = reshape(df['labels'].values)
    gos = df['gos'].values

    fmax, prec, rec, _threshold, preds_max = compute_performance(
        preds, labels, gos)
    print(fmax, prec, rec)
    roc_auc = compute_roc(preds, labels)
    print(roc_auc)
    mcc = compute_mcc(preds_max, labels)
    print(mcc)
DATA_ROOT = 'data/'
MAXLEN = 1000

# NOTE(review): `function` is not defined anywhere in the visible source.
# The statements below look like the body of a `main(function)` whose
# header is not visible here; confirm against the full file before relying
# on them at module level.
global FUNCTION
FUNCTION = function
global GO_ID
GO_ID = FUNC_DICT[FUNCTION]
global go
go = get_gene_ontology('go.obo')
func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
global functions
functions = func_df['functions'].values
global func_set
func_set = set(functions)
global all_functions
all_functions = get_go_set(go, GO_ID)


def compute_roc(preds, labels):
    """Return the area under the ROC curve over all flattened entries.

    Both arrays are flattened first, so a single curve is computed across
    every entry rather than one curve per column.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(
        labels.flatten(), preds.flatten())
    return auc(false_pos_rate, true_pos_rate)


def compute_mcc(preds, labels):
    """Return the Matthews correlation coefficient of the flattened arrays."""
    return matthews_corrcoef(labels.flatten(), preds.flatten())