Пример #1
0
def load_data(parent_id, go_id):
    """Load a class-balanced, one-hot encoded dataset for one GO term.

    Reads ``DATA_ROOT + parent_id + '/' + go_id + '.txt'``. Every line is
    split on single spaces: field 0 is parsed as an integer label and
    field 2 is the sequence, truncated to ``MAXLEN`` characters. Label
    ``1`` is treated as positive, anything else as negative.

    The negatives are shuffled and cut down to at most the number of
    positives, all sequences are encoded with ``encode_seq_one_hot`` and
    finally data and labels are shuffled together.

    ``shuffle`` is a helper defined elsewhere; this code assumes it
    reorders its arguments in place and applies the same permutation to
    every argument for a given seed -- TODO confirm.

    Returns:
        ``(labels, data)`` as numpy arrays; ``data`` has dtype float32.
    """
    path = DATA_ROOT + parent_id + '/' + go_id + '.txt'
    positive = []
    negative = []
    with open(path) as handle:
        for raw in handle:
            fields = raw.strip().split(' ')
            label = int(fields[0])
            seq = fields[2][:MAXLEN]
            bucket = positive if label == 1 else negative
            bucket.append(seq)

    shuffle(negative, seed=0)
    # Keep no more negatives than there are positives.
    negative = negative[:len(positive)]

    labels = [0] * len(negative) + [1] * len(positive)
    data = [
        encode_seq_one_hot(seq, maxlen=MAXLEN)
        for seq in negative + positive
    ]
    shuffle(data, labels, seed=0)
    return numpy.array(labels), numpy.array(data, dtype='float32')
Пример #2
0
def load_data(go_id):
    """Load a balanced dataset with one-hot and hydrophobicity encodings.

    Reads ``DATA_ROOT + go_id + '.txt'``. Every line is split on single
    spaces: field 0 is parsed as an integer label and field 2 is the
    sequence, truncated to ``MAXLEN`` characters. Each sequence is encoded
    twice, with ``encode_seq_one_hot`` and with ``encode_seq_hydro``.

    Negatives are shuffled and limited to at most the number of positives.
    The label list is built from the number of negatives actually kept, so
    it always has exactly one entry per sample (the previous version used
    the full negative count, which produced more labels than samples
    whenever negatives outnumbered positives).

    ``shuffle`` is a helper defined elsewhere; this code assumes it
    reorders its arguments in place and applies the same permutation to
    every argument for a given seed -- TODO confirm.

    Returns:
        ``(labels, (data1, data2))`` where ``labels`` is a float32 array,
        ``data1`` holds the one-hot encodings and ``data2`` the hydro
        encodings, both as float32 arrays.
    """
    positive1 = []
    positive2 = []
    negative1 = []
    negative2 = []
    with open(DATA_ROOT + go_id + '.txt') as f:
        for line in f:
            fields = line.strip().split(' ')
            label = int(fields[0])
            seq = fields[2][:MAXLEN]
            hydro = encode_seq_hydro(seq, maxlen=MAXLEN)
            one_hot = encode_seq_one_hot(seq, maxlen=MAXLEN)
            if label == 1:
                positive1.append(one_hot)
                positive2.append(hydro)
            else:
                negative1.append(one_hot)
                negative2.append(hydro)

    shuffle(negative1, negative2, seed=0)
    n = len(positive1)
    kept_negative1 = negative1[:n]
    kept_negative2 = negative2[:n]
    data1 = kept_negative1 + positive1
    data2 = kept_negative2 + positive2
    # Count only the negatives that were kept so labels line up with data.
    labels = [0] * len(kept_negative1) + [1] * n
    shuffle(data1, data2, labels, seed=0)
    data = (
        numpy.array(data1, dtype='float32'),
        numpy.array(data2, dtype='float32'),
    )
    return (numpy.array(labels, dtype='float32'), data)
Пример #3
0
def load_data(go_id):
    """Load a balanced dataset with one-hot and hydrophobicity encodings.

    Reads ``DATA_ROOT + go_id + '.txt'``; each line is split on single
    spaces, field 0 being the integer label and field 2 the sequence
    (truncated to ``MAXLEN`` characters). Sequences are encoded with both
    ``encode_seq_one_hot`` and ``encode_seq_hydro``.

    After shuffling, at most ``len(positives)`` negatives are kept. Labels
    are generated from the size of that kept subset; the earlier code used
    the size of the full negative list, which yielded a label vector
    longer than the data whenever there were more negatives than
    positives.

    ``shuffle`` is a helper defined elsewhere; it is assumed to permute
    its arguments in place, identically for a given seed -- TODO confirm.

    Returns:
        ``(labels, (data1, data2))``: float32 label array, float32 one-hot
        array and float32 hydro array.
    """
    positive1, positive2 = [], []
    negative1, negative2 = [], []
    with open(DATA_ROOT + go_id + '.txt') as f:
        for line in f:
            parts = line.strip().split(' ')
            label = int(parts[0])
            seq = parts[2][:MAXLEN]
            hydro = encode_seq_hydro(seq, maxlen=MAXLEN)
            one_hot = encode_seq_one_hot(seq, maxlen=MAXLEN)
            if label == 1:
                positive1.append(one_hot)
                positive2.append(hydro)
            else:
                negative1.append(one_hot)
                negative2.append(hydro)

    shuffle(negative1, negative2, seed=0)
    n_pos = len(positive1)
    # Slicing may return fewer than n_pos items when negatives are scarce.
    n_neg = len(negative1[:n_pos])
    data1 = negative1[:n_neg] + positive1
    data2 = negative2[:n_neg] + positive2
    labels = [0] * n_neg + [1] * n_pos
    shuffle(data1, data2, labels, seed=0)
    data = (
        numpy.array(data1, dtype='float32'),
        numpy.array(data2, dtype='float32'))
    return (
        numpy.array(labels, dtype='float32'),
        data)
Пример #4
0
def load_data(parent_id, go_id):
    """Load a multi-label dataset with multi-hot label vectors.

    Reads ``DATA_ROOT + parent_id + '/' + go_id + '.txt'``. Every line is
    split on tabs: field 1 is the sequence (truncated to ``MAXLEN``
    characters) and field 2 is a ``|``-separated list of integer class
    ids.

    Side effect: the module-level ``nb_classes`` is raised to the largest
    class id seen (if that exceeds its current value) and then incremented
    by one. Each label row is a list of ``nb_classes`` zeros with a ``1``
    at every class id of the record.

    NOTE(review): because ``nb_classes`` is global and incremented on
    every call, calling this function repeatedly makes it grow by at
    least one each time -- confirm this is intended.

    ``shuffle`` is a helper defined elsewhere; it is assumed to permute
    its arguments in place, identically for a given seed -- TODO confirm.

    Returns:
        ``(labels, data)`` as float32 numpy arrays.
    """
    global nb_classes
    sequences = []
    class_ids = []
    path = DATA_ROOT + parent_id + '/' + go_id + '.txt'
    with open(path) as handle:
        for raw in handle:
            fields = raw.strip().split('\t')
            sequences.append(fields[1][:MAXLEN])
            ids = []
            for token in fields[2].split('|'):
                class_id = int(token)
                nb_classes = max(nb_classes, class_id)
                ids.append(class_id)
            class_ids.append(ids)
    nb_classes += 1

    # Expand each list of class ids into a fixed-width 0/1 vector.
    labels = []
    for ids in class_ids:
        row = [0] * nb_classes
        for class_id in ids:
            row[class_id] = 1
        labels.append(row)

    data = [encode_seq_one_hot(seq, maxlen=MAXLEN) for seq in sequences]
    shuffle(data, labels, seed=0)
    label_array = numpy.array(labels, dtype='float32')
    data_array = numpy.array(data, dtype='float32')
    return label_array, data_array
Пример #5
0
def load_data(parent_id, go_id):
    """Return balanced labels and one-hot encoded sequences for a GO term.

    The input file is ``DATA_ROOT + parent_id + '/' + go_id + '.txt'``.
    Lines are split on single spaces; field 0 is an integer label (``1``
    means positive, everything else negative) and field 2 is the sequence,
    cut to ``MAXLEN`` characters.

    Negatives are shuffled and truncated to at most the number of
    positives before encoding with ``encode_seq_one_hot``; data and labels
    are then shuffled together.

    ``shuffle`` is a helper defined elsewhere; it is assumed to permute
    its arguments in place, identically for a given seed -- TODO confirm.

    Returns:
        ``(labels, data)`` as numpy arrays; ``data`` has dtype float32.
    """
    by_class = {True: [], False: []}
    with open(DATA_ROOT + parent_id + '/' + go_id + '.txt') as src:
        for record in src:
            columns = record.strip().split(' ')
            is_positive = int(columns[0]) == 1
            by_class[is_positive].append(columns[2][:MAXLEN])
    positive = by_class[True]
    negative = by_class[False]

    shuffle(negative, seed=0)
    # Down-sample the negatives to the size of the positive set.
    negative = negative[:len(positive)]

    labels = [0] * len(negative) + [1] * len(positive)
    data = negative + positive
    for idx, seq in enumerate(data):
        data[idx] = encode_seq_one_hot(seq, maxlen=MAXLEN)
    shuffle(data, labels, seed=0)
    return numpy.array(labels), numpy.array(data, dtype='float32')
def predict_functions(classifier, seq):
    """Collect the ids of the deepest nodes whose models accept ``seq``.

    Starting at ``classifier``, nodes are visited in FIFO order. For each
    node, every child id listed under ``'children'`` is looked up in the
    module-level ``go`` mapping; if that child has a truthy ``'model'``
    entry, the model's ``predict_classes`` is called on the one-hot
    encoded sequence. A child whose prediction ``pred[0][0]`` equals ``1``
    is queued for expansion. A node with no such child contributes its
    ``'id'`` to the result.

    Returns:
        List of node ids, in the order the nodes were visited.
    """
    encoded = numpy.array([encode_seq_one_hot(seq, maxlen=MAXLEN)])
    functions = []
    pending = deque([classifier])
    while pending:
        node = pending.popleft()
        has_positive_child = False
        for child_id in node['children']:
            child = go[child_id]
            if not ('model' in child and child['model']):
                continue
            pred = child['model'].predict_classes(
                encoded, batch_size=1, verbose=0)
            if pred[0][0] == 1:
                has_positive_child = True
                pending.append(go[child_id])
        if not has_positive_child:
            functions.append(node['id'])

    return functions
def predict_functions(classifier, seq):
    """Traverse the classifier tree and return the predicted function ids.

    The traversal is breadth-first from ``classifier``. Children are
    resolved through the module-level ``go`` mapping and only those with a
    truthy ``'model'`` entry are evaluated, by calling the model's
    ``predict_classes`` on the one-hot encoding of ``seq``. When
    ``pred[0][0] == 1`` the child is enqueued; a node for which no child
    was enqueued has its ``'id'`` appended to the result.

    Returns:
        List of the ids of the nodes at which the traversal stopped.
    """
    queue = deque()
    queue.append(classifier)
    result = []
    batch = numpy.array([encode_seq_one_hot(seq, maxlen=MAXLEN)])
    while len(queue) != 0:
        current = queue.popleft()
        accepted = 0
        for ch_id in current['children']:
            entry = go[ch_id]
            if 'model' in entry and entry['model']:
                prediction = entry['model'].predict_classes(
                    batch,
                    batch_size=1,
                    verbose=0)
                if prediction[0][0] == 1:
                    accepted += 1
                    queue.append(go[ch_id])
        # No child claimed the sequence: this node is a final prediction.
        if accepted == 0:
            result.append(current['id'])

    return result
Пример #8
0
def load_data(parent_id, go_id):
    """Load a balanced dataset as two views of the one-hot encoding.

    Reads ``DATA_ROOT + parent_id + '/' + go_id + '.txt'``. Lines are
    split on single spaces; field 0 is the integer label (``1`` means
    positive) and field 2 is the sequence, truncated to ``MAXLEN``
    characters and encoded with ``encode_seq_one_hot``.

    The first view wraps every encoding in a one-element list (adding a
    leading axis per sample); the second view stores the encoding as is.
    Both classes are cut to ``min(#positives, #negatives)`` samples.

    The previous version also computed five additional encodings per line
    that were never used; those dead computations have been removed.

    ``shuffle`` is a helper defined elsewhere; it is assumed to permute
    its arguments in place, identically for a given seed -- TODO confirm.

    Returns:
        ``(labels, (data1, data2))``: label array of 0.0/1.0 values and
        the two float32 data arrays.
    """
    positive1 = []
    negative1 = []
    positive2 = []
    negative2 = []

    with open(DATA_ROOT + parent_id + '/' + go_id + '.txt') as f:
        for line in f:
            fields = line.strip().split(' ')
            label = int(fields[0])
            seq = fields[2][:MAXLEN]
            sq1 = encode_seq_one_hot(seq, maxlen=MAXLEN)
            if label == 1:
                positive1.append([sq1])
                positive2.append(sq1)
            else:
                negative1.append([sq1])
                negative2.append(sq1)

    shuffle(negative1, negative2, seed=0)
    n = min(len(positive1), len(negative1))
    data1 = negative1[:n] + positive1[:n]
    data2 = negative2[:n] + positive2[:n]
    labels = [0.0] * n + [1.0] * n
    # NOTE(review): the original carried the remark "Previous was 30" here;
    # what it referred to is not visible in this file.
    shuffle(data1, data2, labels, seed=0)
    data = (
        numpy.array(data1, dtype='float32'),
        numpy.array(data2, dtype='float32'))
    return (numpy.array(labels), data)
Пример #9
0
def load_data(parent_id, go_id):
    """Load a balanced dataset as two views of the one-hot encoding.

    The input file is ``DATA_ROOT + parent_id + '/' + go_id + '.txt'``;
    each line is split on single spaces, with the integer label in field
    0 (``1`` means positive) and the sequence in field 2. Sequences are
    truncated to ``MAXLEN`` characters and encoded with
    ``encode_seq_one_hot``.

    ``data1`` holds each encoding wrapped in a one-element list and
    ``data2`` holds the bare encoding. Positives and negatives are both
    limited to ``min(#positives, #negatives)`` samples.

    Five extra per-line encodings that the previous version computed and
    then discarded are no longer evaluated.

    ``shuffle`` is a helper defined elsewhere; it is assumed to permute
    its arguments in place, identically for a given seed -- TODO confirm.

    Returns:
        ``(labels, (data1, data2))``: label array of 0.0/1.0 values and
        the two float32 data arrays.
    """
    positive1, positive2 = [], []
    negative1, negative2 = [], []

    with open(DATA_ROOT + parent_id + '/' + go_id + '.txt') as f:
        for line in f:
            parts = line.strip().split(' ')
            label = int(parts[0])
            encoded = encode_seq_one_hot(parts[2][:MAXLEN], maxlen=MAXLEN)
            if label == 1:
                positive1.append([encoded])
                positive2.append(encoded)
            else:
                negative1.append([encoded])
                negative2.append(encoded)

    shuffle(negative1, negative2, seed=0)
    n = min(len(positive1), len(negative1))
    data1 = negative1[:n] + positive1[:n]
    data2 = negative2[:n] + positive2[:n]
    labels = [0.0] * n + [1.0] * n
    # NOTE(review): the original carried the remark "Previous was 30" here;
    # what it referred to is not visible in this file.
    shuffle(data1, data2, labels, seed=0)
    data = (numpy.array(data1, dtype='float32'),
            numpy.array(data2, dtype='float32'))
    return (numpy.array(labels), data)
def load_data(go_id):
    """Load a balanced dataset with one-hot and MEHP950102 encodings.

    Reads ``DATA_ROOT + '/' + go_id + '.txt'``. Lines are split on single
    spaces; field 0 is the integer label (``1`` means positive) and field
    2 is the sequence, truncated to ``MAXLEN`` characters.

    Each sequence is encoded with ``encode_seq_one_hot`` and, in parallel,
    as a list of ``MEHP950102[AAINDEX[letter]]`` entries padded with rows
    of twenty ``0.0`` values until it has ``MAXLEN`` rows. Both classes
    are cut to ``min(#positives, #negatives)`` samples.

    ``shuffle`` is a helper defined elsewhere and is called here without
    a seed; it is assumed to permute all of its arguments in place with
    one common permutation -- TODO confirm.

    Returns:
        A 4-tuple ``(labels, data1, data2, data1_again)`` where the last
        element is a second float32 array built from the same one-hot
        data as the second element.
    """
    pos_onehot, neg_onehot = [], []
    pos_index, neg_index = [], []

    with open(DATA_ROOT + '/' + go_id + '.txt') as handle:
        for raw in handle:
            fields = raw.strip().split(' ')
            label = int(fields[0])
            seq = fields[2][:MAXLEN]
            onehot = encode_seq_one_hot(seq, maxlen=MAXLEN)
            indexed = [MEHP950102[AAINDEX[letter]] for letter in seq]
            # Pad with all-zero rows up to MAXLEN entries.
            indexed.extend(
                [0.0] * 20 for _ in range(MAXLEN - len(indexed)))
            if label == 1:
                pos_onehot.append(onehot)
                pos_index.append(indexed)
            else:
                neg_onehot.append(onehot)
                neg_index.append(indexed)

    shuffle(neg_onehot, neg_index)
    n = min(len(pos_onehot), len(neg_onehot))
    data1 = neg_onehot[:n] + pos_onehot[:n]
    data2 = neg_index[:n] + pos_index[:n]
    labels = [0.0] * n + [1.0] * n
    shuffle(data1, data2, labels)

    label_array = numpy.array(labels)
    first = numpy.array(data1, dtype='float32')
    second = numpy.array(data2, dtype='float32')
    third = numpy.array(data1, dtype='float32')
    return (label_array, first, second, third)
Пример #11
0
def load_data(go_id):
    """Load a balanced, one-hot encoded dataset for one GO term.

    Reads ``DATA_ROOT + go_id + '.txt'``. Lines are split on single
    spaces; field 0 is the integer label and field 2 is the sequence,
    which is cut to 500 characters and encoded with
    ``encode_seq_one_hot(..., maxlen=MAXLEN)``.

    Negatives are shuffled and limited to at most the number of
    positives. Labels are generated from the number of negatives actually
    kept, so labels and data have the same length even when there are
    fewer negatives than positives (the previous version always emitted
    ``n`` zero labels).

    ``shuffle`` is a helper defined elsewhere; it is assumed to permute
    its arguments in place, identically for a given seed -- TODO confirm.

    Returns:
        ``(labels, data)``: label array of 0.0/1.0 values and a float32
        data array.
    """
    pos = 1  # label value that marks a positive sample
    positive = []
    negative = []
    with open(DATA_ROOT + go_id + '.txt') as f:
        for line in f:
            fields = line.strip().split(' ')
            label = int(fields[0])
            # NOTE(review): truncation uses the literal 500 while the
            # encoder receives MAXLEN -- confirm the two are meant to match.
            seq = encode_seq_one_hot(fields[2][:500], maxlen=MAXLEN)
            if label == pos:
                positive.append(seq)
            else:
                negative.append(seq)

    shuffle(negative, seed=0)
    n = len(positive)
    kept_negative = negative[:n]
    data = kept_negative + positive
    # One label per sample: the slice may hold fewer than n negatives.
    labels = [0.0] * len(kept_negative) + [1.0] * n
    # NOTE(review): the original carried the remark "Previous was 30" here;
    # what it referred to is not visible in this file.
    shuffle(data, labels, seed=0)
    return numpy.array(labels), numpy.array(data, dtype="float32")