Example #1
def get_sequences():
    # Collect valid sequences from the CAFA3 target and training files.
    data = list()
    with open('data/cafa3/targets.txt', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            if is_ok(items[1]):
                data.append(items[1])

    with open('data/cafa3/data.txt', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            if is_ok(items[1]):
                data.append(items[1])
    return data
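All of these snippets rely on an is_ok() helper that is never shown. A minimal sketch, assuming it accepts only sequences built from the 20 standard amino-acid letters up to some maximum length (both details are assumptions, not taken from the snippets):

AALETTER = set('ACDEFGHIKLMNPQRSTVWY')
MAXLEN = 1002  # assumed length cap; adjust to the model's input size

def is_ok(seq):
    # Reject overly long sequences and any non-standard residue codes.
    return len(seq) <= MAXLEN and all(c in AALETTER for c in seq)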
Example #2
def get_functions(annot_num):
    # Count experimentally supported GO annotations (propagated to their
    # ancestors) and keep the functions seen at least annot_num times.
    # Relies on module-level go, func_set, functions, EXP_CODES, DATA_ROOT.
    df = pd.read_pickle(DATA_ROOT + 'swissprot_exp.pkl')
    annots = dict()
    for i, row in df.iterrows():
        go_set = set()
        if not is_ok(row['sequences']):
            continue
        for annot in row['annots']:
            go_id, code = annot.split('|')[:2]
            if code not in EXP_CODES:
                continue
            if go_id in func_set:
                go_set |= get_anchestors(go, go_id)
        for go_id in go_set:
            if go_id not in annots:
                annots[go_id] = 0
            annots[go_id] += 1
    filtered = list()
    for go_id in functions:
        if go_id in annots and annots[go_id] >= annot_num:
            filtered.append(go_id)
    print(len(filtered))
    df = pd.DataFrame({'functions': filtered})
    df.to_pickle(DATA_ROOT + FUNCTION + '.pkl')
    print('Saved ' + DATA_ROOT + FUNCTION + '.pkl')
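get_anchestors (the project's own spelling) and the go ontology object are defined elsewhere. A minimal sketch, assuming go maps each GO id to a record with an 'is_a' list of parent ids:

from collections import deque

def get_anchestors(go, go_id):
    # Walk 'is_a' links upward, collecting go_id and every ancestor.
    ancestors = set()
    queue = deque([go_id])
    while queue:
        gid = queue.popleft()
        if gid in ancestors or gid not in go:
            continue
        ancestors.add(gid)
        queue.extend(go[gid]['is_a'])
    return ancestors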
Example #3
def load_data():
    # Build a training DataFrame of n-gram encoded sequences and
    # multi-label GO annotation vectors. Relies on module-level go,
    # func_set, functions, go_indexes, GO_ID and EXP_CODES.
    ngram_df = pd.read_pickle(DATA_ROOT + 'ngrams.pkl')
    vocab = {}
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))
    proteins = list()
    gos = list()
    labels = list()
    ngrams = list()
    sequences = list()
    accessions = list()
    df = pd.read_pickle(DATA_ROOT + 'swissprot_exp.pkl')
    # Filtering data by sequences
    index = list()
    for i, row in df.iterrows():
        if is_ok(row['sequences']):
            index.append(i)
    df = df.loc[index]

    for i, row in df.iterrows():
        go_list = []
        for item in row['annots']:
            items = item.split('|')
            if items[1] in EXP_CODES:
                go_list.append(items[0])
            # go_list.append(items[0])
        go_set = set()
        for go_id in go_list:
            if go_id in func_set:
                go_set |= get_anchestors(go, go_id)
        if not go_set or GO_ID not in go_set:
            continue
        go_set.remove(GO_ID)
        gos.append(go_list)
        proteins.append(row['proteins'])
        accessions.append(row['accessions'])
        seq = row['sequences']
        sequences.append(seq)
        grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
        for j in range(len(seq) - gram_len + 1):
            grams[j] = vocab[seq[j:(j + gram_len)]]
        ngrams.append(grams)
        label = np.zeros((len(functions), ), dtype='int32')
        for go_id in go_set:
            if go_id in go_indexes:
                label[go_indexes[go_id]] = 1
        labels.append(label)
    res_df = pd.DataFrame({
        'accessions': accessions,
        'proteins': proteins,
        'ngrams': ngrams,
        'labels': labels,
        'gos': gos,
        'sequences': sequences
    })
    print(len(res_df))
    return res_df
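The ngrams column produced here holds variable-length index vectors; before training they are typically padded to a fixed width. A minimal sketch (the pad_ngrams name and the maxlen default are assumptions, not from the original):

import numpy as np

def pad_ngrams(ngrams, maxlen=1002):
    # Zero-pad (or truncate) each n-gram vector to a fixed width so the
    # result stacks into a single (n_samples, maxlen) int32 matrix.
    data = np.zeros((len(ngrams), maxlen), dtype='int32')
    for i, grams in enumerate(ngrams):
        n = min(len(grams), maxlen)
        data[i, :n] = grams[:n]
    return data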
Example #4
def to_pandas():
    # Convert data/data.tsv into pickled DataFrames of sequences, n-grams
    # and InterPro annotations. Uses a module-level AAINDEX mapping and
    # collections.Counter.
    ngram_df = pd.read_pickle('data/ngrams.pkl')
    vocab = {}
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))

    proteins = list()
    accessions = list()
    sequences = list()
    interpros = list()
    ngrams = list()
    indexes = list()
    counter = Counter()
    maxlen = 0
    with open('data/data.tsv') as f:
        for line in f:
            items = line.strip().split('\t')
            seq = items[2]
            if not is_ok(seq) or len(seq) > 1600:
                continue
            proteins.append(items[0])
            accessions.append(items[1].split(';')[0])
            maxlen = max(maxlen, len(seq))
            sequences.append(seq)
            grams = list()
            for i in range(len(seq) - gram_len + 1):
                grams.append(vocab[seq[i: (i + gram_len)]])
            index = np.array([AAINDEX[x] for x in seq])
            indexes.append(index)
            ngrams.append(np.array(grams))
            interpros.append(items[3:])
            for item in items[3:]:
                counter[item] += 1
    print('Maximum sequence length: ', maxlen)
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'ngrams': ngrams,
        'interpros': interpros,
        'indexes': indexes
    })
    print(df)
    df.to_pickle('data/data.pkl')
    dictionary = list()
    for ipro, cnt in counter.items():
        if cnt >= 100:
            dictionary.append(ipro)
    dict_df = pd.DataFrame({'interpros': dictionary})
    print(dict_df)
    dict_df.to_pickle('data/dictionary.pkl')
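AAINDEX is used above but never defined in these snippets. A plausible construction, assuming the 20 standard amino acids are numbered from 1 so that index 0 can serve as padding:

AALETTER = 'ACDEFGHIKLMNPQRSTVWY'
AAINDEX = {aa: i + 1 for i, aa in enumerate(AALETTER)}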
Example #5
def get_data():
    # Encode eshark target sequences as n-grams and attach graph
    # embeddings by accession, zero-filling the missing ones.
    proteins = list()
    targets = list()
    orgs = list()
    ngrams = list()
    ngram_df = pd.read_pickle('data/eshark/ngrams.pkl')
    vocab = {}
    mapping = get_blast_mapping()
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))

    with open('data/eshark/targets.txt') as f:
        for line in f:
            it = line.strip().split('\t')
            seq = it[1]
            if is_ok(seq):
                # orgs.append(it[0])
                targets.append(it[0])
                if it[0] in mapping:
                    proteins.append(mapping[it[0]])
                else:
                    proteins.append('')
                grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
                for i in range(len(seq) - gram_len + 1):
                    grams[i] = vocab[seq[i:(i + gram_len)]]
                ngrams.append(grams)

    df = pd.DataFrame({
        'targets': targets,
        'accessions': proteins,
        'ngrams': ngrams
    })

    print(len(df))
    embed_df = pd.read_pickle('data/graph_new_embeddings.pkl')

    df = pd.merge(df, embed_df, on='accessions', how='left')

    missing_rep = 0
    for i, row in df.iterrows():
        if not isinstance(row['embeddings'], np.ndarray):
            # Write through df.at so the change persists; assigning to the
            # iterrows() row copy would be silently discarded.
            df.at[i, 'embeddings'] = np.zeros((256, ), dtype='float32')
            missing_rep += 1
    print('Missing embeddings:', missing_rep)

    df.to_pickle('data/eshark/targets.pkl')
Example #6
def filter_exp():
    # Keep only proteins that have at least one annotation with an
    # experimental evidence code and a valid sequence.
    df = pd.read_pickle(DATA_ROOT + 'swissprot.pkl')
    exp_codes = set(['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS', 'IC'])
    index = list()
    for i, row in df.iterrows():
        ok = False
        for go_id in row['annots']:
            code = go_id.split('|')[1]
            if code in exp_codes:
                ok = True
                break
        if ok and is_ok(row['sequences']):
            index.append(i)
    df = df.loc[index]
    print(len(df))
    df.to_pickle(DATA_ROOT + 'swissprot_exp.pkl')
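The same filter can be written without iterrows(), which is usually much faster on large frames. A sketch under the same column assumptions, with is_ok as elsewhere in these examples (the function name is hypothetical):

def filter_exp_vectorized(df, exp_codes):
    # Boolean-mask equivalent of the row loop in filter_exp().
    def has_exp_code(annots):
        return any(a.split('|')[1] in exp_codes for a in annots)
    mask = df['annots'].apply(has_exp_code) & df['sequences'].apply(is_ok)
    return df[mask]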
Example #7
def cafa3():
    # Merge the CAFA3 training FASTA with its GO annotations into one
    # tab-separated file.
    root = 'data/cafa3/CAFA3_training_data/'
    filename = root + 'uniprot_sprot_exp.fasta'
    data = read_fasta(filename)
    annots = dict()
    with open(root + 'uniprot_sprot_exp.txt') as f:
        for line in f:
            items = line.strip().split('\t')
            if items[0] not in annots:
                annots[items[0]] = set()
            annots[items[0]].add(items[1])
    with open(root + 'uniprot_sprot.tab', 'w') as fl:
        for line in data:
            items = line.split('\t')
            if is_ok(items[1]) and items[0] in annots:
                fl.write(line + '\t')
                # Write the GO ids as a '; '-separated list.
                fl.write('; '.join(annots[items[0]]))
                fl.write('\n')
Example #8
def load_data():
    ngram_df = pd.read_pickle('data/ngrams.pkl')
    vocab = {}
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))

    ngrams = list()
    proteins = list()
    with open('data/swissprot.fasta') as f:
        prots, seqs = read_fasta(f.readlines())
    for protein, seq in zip(prots, seqs):
        if not is_ok(seq) or len(seq) - gram_len + 1 > MAXLEN:
            continue
        proteins.append(protein)
        grams = list()
        for i in range(len(seq) - gram_len + 1):
            grams.append(vocab[seq[i: (i + gram_len)]])
        ngrams.append(grams)
        
    df = pd.DataFrame({
        'proteins': proteins,
        'ngrams': ngrams,
    })

    def get_values(df):
        # Pack the n-gram lists into a sparse (n_proteins, MAXLEN) matrix;
        # assumes scipy.sparse is imported at module level as `sparse`.
        grows = []
        gcols = []
        gdata = []
        for i, row in enumerate(df.itertuples()):
            for j in range(len(row.ngrams)):
                grows.append(i)
                gcols.append(j)
                gdata.append(row.ngrams[j])
        data = sparse.csr_matrix((gdata, (grows, gcols)), shape=(len(df), MAXLEN))
        return data

    return proteins, get_values(df)
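read_fasta is also external. For the (proteins, sequences) call shape used above, a minimal parser sketch, assuming one-line '>' headers whose first token is the protein id:

def read_fasta(lines):
    # Parse FASTA lines into parallel lists of ids and joined sequences.
    prots, seqs, current = [], [], []
    for line in lines:
        line = line.strip()
        if line.startswith('>'):
            if current:
                seqs.append(''.join(current))
                current = []
            prots.append(line[1:].split()[0])
        else:
            current.append(line)
    if current:
        seqs.append(''.join(current))
    return prots, seqs

Note that Example #7 calls a different read_fasta variant that returns raw tab-joined lines, so treat this only as a sketch for the two-list call shape.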
Example #9
def load_data(split=0.9):

    ngrams = list()
    kept = list()
    df = pd.read_pickle('data/sw_scores.pkl')
    prot_index = {}
    for row in df.itertuples():
        seq = row.sequences
        if not is_ok(seq) or len(seq) > MAXLEN:
            continue
        grams = list(map(lambda x: AAINDEX[x], seq))
        ngrams.append(grams)
        prot_index[row.proteins] = len(prot_index)
        kept.append(row.Index)

    # Keep only the rows that passed the filter so the new column
    # lines up with the frame.
    df = df.loc[kept]
    df['ngrams'] = ngrams
    n = len(df)
    index = np.arange(n)
    np.random.seed(seed=0)
    np.random.shuffle(index)
    train_n = int(n * split)
    valid_n = int(train_n * split)
    train_df = df.iloc[index[:valid_n]]
    valid_df = df.iloc[index[valid_n:train_n]]
    test_df = df.iloc[index[train_n:]]

    def get_values(df):
        # One-hot encode each sequence over 21 amino-acid channels and
        # cut each score vector down to the columns of the proteins in
        # this split.
        index = np.zeros((len(df), ), dtype=np.int32)
        data = np.zeros((len(df), MAXLEN, 21), dtype=np.float32)
        for i, row in enumerate(df.itertuples()):
            for j in range(len(row.ngrams)):
                data[i, j, row.ngrams[j]] = 1
            index[i] = prot_index[row.proteins]
        scores = df['scores'].values
        for i in range(len(scores)):
            scores[i] = scores[i][index]
        return data, scores

    train, valid, test = get_values(train_df), get_values(
        valid_df), get_values(test_df)

    return train, valid, test
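Each returned split is a (one-hot, scores) pair; a short usage sketch (shapes assume the MAXLEN and 21-channel encoding used above):

(train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_data(split=0.9)
print(train_x.shape)  # (n_train, MAXLEN, 21)
print(train_y.shape)  # (n_train,) array of per-row score vectors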
Example #10
def to_pickle_org(org='mouse'):
    # Pickle per-organism UniProt sequences with n-grams, then attach
    # graph embeddings, filling gaps via a blastp nearest-hit lookup.
    proteins = list()
    accessions = list()
    sequences = list()
    length = list()
    status = list()
    ngrams = list()

    ngram_df = pd.read_pickle(DATA_ROOT + 'ngrams.pkl')
    vocab = {}
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))

    with gzip.open(DATA_ROOT + 'uniprot-' + org + '.tab.gz', 'rt') as f:
        next(f)  # skip the header row
        for line in f:
            items = line.strip().split('\t')
            seq = items[2]
            if not is_ok(seq):
                continue
            proteins.append(items[1])
            accessions.append(items[0])
            sequences.append(seq)
            length.append(int(items[3]))
            status.append(items[4])
            grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
            for i in range(len(seq) - gram_len + 1):
                grams[i] = vocab[seq[i:(i + gram_len)]]
            ngrams.append(grams)

    # with open('data/cafa3/tremble_data.tab') as f:
    #     for line in f:
    #         items = line.strip().split('\t')
    #         if items[0] not in prots:
    #             prots.add(items[0])
    #             proteins.append(items[0])
    #             accessions.append(items[1])
    #             sequences.append(items[2])

    # with open('data/cafa3/uniprot_trembl.tab') as f:
    #     for line in f:
    #         items = line.strip().split('\t')
    #         if items[1] not in prots:
    #             proteins.append(items[1])
    #             accessions.append(items[0])
    #             sequences.append(items[2])
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'length': length,
        'status': status,
        'ngrams': ngrams
    })
    print(len(df))
    df.to_pickle(DATA_ROOT + org + '-sequences.pkl')
    # Filter reviewed
    df = df[df['status'] == 'reviewed']
    print(len(df))
    print('Loading embeddings')
    rep_df = pd.read_pickle('data/graph_new_embeddings.pkl')
    embeds = {}
    for i, row in rep_df.iterrows():
        embeds[row['accessions']] = row['embeddings']
    df = pd.merge(df, rep_df, on='accessions', how='left')

    # Run blastp in text mode so str queries can be written to stdin.
    p = Popen([
        'blastp', '-db', 'data/embeddings.fa', '-max_target_seqs', '1',
        '-num_threads', '128', '-outfmt', '6 qseqid sseqid'
    ], stdin=PIPE, stdout=PIPE, universal_newlines=True)
    missing_rep = 0
    for i, row in df.iterrows():
        if not isinstance(row['embeddings'], np.ndarray):
            p.stdin.write('>' + row['accessions'] + '\n' + row['sequences'] +
                          '\n')
            missing_rep += 1
    print('Starting blastp for %d' % missing_rep)

    p.stdin.close()
    embed_map = {}
    if p.wait() == 0:
        for line in p.stdout:
            print(line)
            it = line.strip().split('\t')
            embed_map[it[0]] = it[1]
    missing_rep = 0
    for i, row in df.iterrows():
        if not isinstance(row['embeddings'], np.ndarray):
            if row['accessions'] in embed_map:
                df.at[i, 'embeddings'] = embeds[embed_map[row['accessions']]]
            else:
                df.at[i, 'embeddings'] = np.zeros((256, ), dtype=np.float32)
                missing_rep += 1
    print('Missing reps: ', missing_rep)

    df.to_pickle(DATA_ROOT + org + '-data.pkl')