예제 #1
0
# Script fragment: loads the position matrices and builds the state
# sequences for several signal zones (poly-A, start, stop).
# NOTE(review): numpy, stop_divider, matrix_from_exa, percentage_matrix_maker,
# spacer_states_maker, State and DiscreteDistribution are not imported in the
# visible part of this snippet — confirm they are brought in above.

# matrixEZ = numpy.array(matrix_from_exa('new_tts.exa'))
# stop_divider returns three matrices from 'new_tts.exa'; judging by the
# target names, presumably one per stop codon (TAA/TGA/TAG) — TODO confirm.
taa_matrix, tga_matrix, tag_matrix = stop_divider('new_tts.exa')

# One matrix per donor/acceptor .exa file, suffixes 0..2.
matrixDonor0 = numpy.array(matrix_from_exa('new_donor0.exa'))
matrixDonor1 = numpy.array(matrix_from_exa('new_donor1.exa'))
matrixDonor2 = numpy.array(matrix_from_exa('new_donor2.exa'))
matrixAcceptor0 = numpy.array(matrix_from_exa('new_acceptor0.exa'))
matrixAcceptor1 = numpy.array(matrix_from_exa('new_acceptor1.exa'))
matrixAcceptor2 = numpy.array(matrix_from_exa('new_acceptor2.exa'))

# (hexamer, integer) pairs; the integers are presumably observed counts used
# as weights by percentage_matrix_maker — TODO confirm.
polyASeqs = [('AATAAA', 592), ('ATTAAA', 149), ('AGTAAA', 27), ('TATAAA', 32),
             ('CATAAA', 13), ('GATAAA', 13), ('AATATA', 17), ('AATACA', 12),
             ('AATAGA', 7), ('ACTAAA', 6), ('AAGAAA', 11), ('AATGAA', 8)]

# Poly-A signal: matrix -> classify(..., 2) -> named state sequence.
matrixPolyA = numpy.array(percentage_matrix_maker(polyASeqs))
poly_a_signal_data = classify(matrixPolyA, 2)
poly_a_states = sequence_state_factory(poly_a_signal_data, 'poly a zone ')

# The `.p` attribute of the calculator result is used directly as the
# emission table for the states below.
utr_exon_probs = calculator.utr_exon_3('mcuts.txt').p

exon3_state = State(DiscreteDistribution(utr_exon_probs), name='3utr exon')
# First argument (15) is presumably the spacer length — TODO confirm.
post_poly_spacer = spacer_states_maker(15, utr_exon_probs, 'post_poly_spacer')

# NOTE(review): matrixZE is not assigned anywhere in the visible lines of
# this snippet — it must be defined earlier in the original script.
ze_states_data = classify(matrixZE, 2)
ze_states = sequence_state_factory(ze_states_data, 'start zone')

# Stop-zone state sequences, one per matrix returned by stop_divider.
# tag_matrix is not used in the visible lines (the snippet may be cut off).
ez_states_taa_data = classify(numpy.array(taa_matrix), 2)
ez_states_taa = sequence_state_factory(ez_states_taa_data, 'stop zone taa')

ez_states_tga_data = classify(numpy.array(tga_matrix), 2)
ez_states_tga = sequence_state_factory(ez_states_tga_data, 'stop zone tga')
import numpy
from pomegranate import State
from pomegranate import DiscreteDistribution
from pomegranate import HiddenMarkovModel
import calculator
from converter_to import converter_to
from model_maker_utils import sequence_state_factory
from model_maker_utils import classify
from model_maker_utils import add_sequence
from model_maker_utils import equal_distribution
from matrix_from_aln import matrix_from_exa

# Builds the 'intron_acceptor' HMM: an intron state that loops on itself,
# followed by the acceptor state sequence and a final 'post' state.

# NOTE(review): the variable is named ...Acceptor0 but the file loaded is
# 'new_acceptor1.exa' — confirm the index mismatch is intentional.
matrixAcceptor0 = numpy.array(matrix_from_exa('new_acceptor1.exa'))
acceptor0_data = classify(matrixAcceptor0, 2)

model = HiddenMarkovModel('intron_acceptor')

# Intron emissions come from the `.p` attribute of the calculator result.
intron = State(DiscreteDistribution(
    calculator.intron_calculator('cuts_intron.txt').p),
               name='in')
acceptor0_states = sequence_state_factory(acceptor0_data, 'acceptor0')
post = State(DiscreteDistribution(equal_distribution), name='post')

# Register the states; add_sequence handles the acceptor state list
# (presumably adding each state and chaining them — TODO confirm).
model.add_state(intron)
add_sequence(model, acceptor0_states)
model.add_state(post)

# start -> intron; intron stays with 0.9 or enters the first acceptor state
# with 0.1; the last acceptor state always moves to 'post'.
# No outgoing transition from 'post' and no model.bake() appear in the
# visible lines.
model.add_transition(model.start, intron, 1)
model.add_transition(intron, intron, 0.9)
model.add_transition(intron, acceptor0_states[0], 0.1)
model.add_transition(acceptor0_states[-1], post, 1)
예제 #3
0
def train_and_test():
    """Build, train and save the 'coding to donor' hidden Markov model.

    Training lines are read from '../data extractors/exons_start_1.txt';
    every 'P' is dropped, the text is lower-cased and newlines are removed
    before each line is passed through ``converter_to(line, 2)``.

    The model consists of three coding states visited cyclically
    (0 -> 1 -> 2 -> 0, probability 0.6 each), each of which may instead
    enter the first donor state (probability 0.4).  The last donor state
    leads to a 'post' state that loops (0.9) or ends the model (0.1).

    ``test_model`` is called once on the baked model and once more after
    fitting; the fitted model is written as JSON to
    'partial_model_coding_to_donor_model0.json'.
    """
    with open('../data extractors/exons_start_1.txt') as in_file:
        total = [
            raw.replace('P', '').lower().replace('\n', '')
            for raw in in_file
        ]

    converted_total = [converter_to(sequence, 2) for sequence in total]

    donor_matrix = numpy.array(
        matrix_from_exa('../data extractors/new_donor1.exa'))

    # Unpack explicitly so a result of the wrong length still fails here.
    first, second, third = calculator.calculate_proba2(
        '../data extractors/new_cuts.txt')
    codon_dists = (first, second, third)
    print(*(dist.p for dist in codon_dists))

    state_labels = ('coding state 0', 'coding state 1', 'coding state 2')
    coding_states = [
        State(DiscreteDistribution(dist.p), label)
        for dist, label in zip(codon_dists, state_labels)
    ]

    donor0_states = sequence_state_factory(
        classify(donor_matrix, 2), 'donor0')

    post_state = State(DiscreteDistribution(equal_distribution), name='post')

    hmm = HiddenMarkovModel('coding to donor')

    for coding_state in coding_states:
        hmm.add_state(coding_state)
    add_sequence(hmm, donor0_states)
    hmm.add_state(post_state)

    hmm.add_transition(hmm.start, coding_states[0], 1)

    # Pair every coding state with the next one, wrapping 2 -> 0.
    successors = coding_states[1:] + coding_states[:1]
    for current, following in zip(coding_states, successors):
        hmm.add_transition(current, following, 0.6)
        hmm.add_transition(current, donor0_states[0], 0.4)

    hmm.add_transition(donor0_states[-1], post_state, 1)
    hmm.add_transition(post_state, post_state, 0.9)
    hmm.add_transition(post_state, hmm.end, 0.1)

    hmm.bake()
    test_model(hmm)

    fit_options = {
        'transition_pseudocount': 1,
        'emission_pseudocount': 1,
        'verbose': True,
    }
    hmm.fit(converted_total, **fit_options)

    test_model(hmm)

    with open('partial_model_coding_to_donor_model0.json', 'w') as out:
        out.write(hmm.to_json())
예제 #4
0
import calculator
from model_maker_utils import sequence_state_factory, classify, add_sequence, equal_distribution
from matrix_from_aln import matrix_from_exa
from converter_to import converter_to

# Builds the 'coding_to_stop' HMM: three coding states visited in a cycle,
# with an exit into the stop state sequence.
# NOTE(review): numpy, State, DiscreteDistribution and HiddenMarkovModel are
# not imported in the visible part of this snippet — confirm they are
# imported above.

# calculate_proba2 returns three objects whose `.p` attribute is used as the
# emission table of one coding state each.
c0, c1, c2 = calculator.calculate_proba2('../data extractors/new_cuts.txt')
matrixStop = numpy.array(matrix_from_exa('../data extractors/new_stops.exa'))
coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

post = State(DiscreteDistribution(equal_distribution), name='post')

model = HiddenMarkovModel('coding_to_stop')

stop_data = classify(matrixStop, 2)
stop_states = sequence_state_factory(stop_data, 'stop')

model.add_state(coding_state0)
model.add_state(coding_state1)
model.add_state(coding_state2)

add_sequence(model, stop_states)

model.add_state(post)

# NOTE(review): the chain is entered at coding_state1, not coding_state0 —
# confirm this offset is intended.
model.add_transition(model.start, coding_state1, 1)
# 0 -> 1 and 1 -> 2 are certain; only coding_state2 can either wrap back to
# coding_state0 (0.6) or enter the first stop state (0.4).
model.add_transition(coding_state0, coding_state1, 1)
model.add_transition(coding_state1, coding_state2, 1)
model.add_transition(coding_state2, coding_state0, 0.6)
model.add_transition(coding_state2, stop_states[0], 0.4)
예제 #5
0
            test_line = x_line.lower().replace('\n', '').replace(' ', '')
            tonight = converter_to(test_line, 2)
            logp, path = model.viterbi(tonight)
            path = [x[1].name for i, x in enumerate(path) if i < len(tonight)]
            if path[48] == 'start zone7':
                oks += 1
            else:
                not_ok += 1
        print(oks / (oks + not_ok))


# Builds and bakes an unnamed HMM: a 'back' state, the start-zone state
# sequence, and a trailing 'back2' state.

# Both flanking states emit according to equal_distribution.
back = State(DiscreteDistribution(equal_distribution), name='back')
back2 = State(DiscreteDistribution(equal_distribution), name='back2')

matrixZE = numpy.array(matrix_from_exa('../data extractors/starts.exa'))
start_states_data = classify(matrixZE, 2)
start_states = sequence_state_factory(start_states_data, 'start zone')

model = HiddenMarkovModel()

model.add_state(back)
model.add_state(back2)
add_sequence(model, start_states)

# start -> back; back loops with 0.55 or enters the first start-zone state
# with 0.45; the last start-zone state always moves to back2.
model.add_transition(model.start, back, 1)
model.add_transition(back, back, 0.55)
model.add_transition(back, start_states[0], 0.45)
model.add_transition(start_states[-1], back2, 1)
# NOTE(review): back2 only gets a 0.5 self-loop here; no transition to
# model.end (or anywhere else) is added in the visible lines — confirm this
# is intended.
model.add_transition(back2, back2, 0.5)

model.bake()
예제 #6
0
from model_maker_utils import add_variable_length_sequence
from model_maker_utils import load_long_training_examples
from converter_to import converter_to
import calculator
from pomegranate import State
from pomegranate import HiddenMarkovModel
from pomegranate import DiscreteDistribution


# Set-up for the 'promoter' HMM: loads one matrix per sequence file, runs
# each through classify(..., 2), and creates the model and its first states.
# NOTE(review): numpy, matrix_from_fasta, classify and sequence_state_factory
# are not among the imports visible in this snippet — confirm they are
# imported above.

# One matrix per motif file (file names suggest TATA, GC, CCAAT and Inr
# elements plus a 'no_inr' set — TODO confirm what each file contains).
matrix_TATA = numpy.array(matrix_from_fasta('tata_-5_11_completa.seq'))
matrix_GC = numpy.array(matrix_from_fasta('gc_completo.seq'))
matrix_CCAAT = numpy.array(matrix_from_fasta('CCAAT_completa.seq'))
matrix_Inr = numpy.array(matrix_from_fasta('Inr_completo.seq'))
matrix_no_inr = numpy.array(matrix_from_fasta('no_inr.fa'))

gc_data = classify(matrix_GC, 2)
tata_data = classify(matrix_TATA, 2)
cat_data = classify(matrix_CCAAT, 2)
inr_data = classify(matrix_Inr, 2)
no_inr_data = classify(matrix_no_inr, 2)

# The result's `.p` attribute is used below as the emission table of the
# 'back' state.
no_coding = calculator.intron_calculator('cuts_intron.txt')


# Model
promoter_utr_model = HiddenMarkovModel('promoter')

# States
back = State(DiscreteDistribution(no_coding.p), name='back')

# State sequence for the GC matrix, named with the 'GC' prefix.
gc_states = sequence_state_factory(gc_data, 'GC')
    total = []
    for line in in_file:
        no_p_line = line.replace('P', '').lower().replace('\n', '')
        total.append(no_p_line)

# Script fragment: converts the training lines and starts assembling the
# coding -> donor HMM (three coding states, donor state sequence, 'post').
# NOTE(review): `total` is filled by code whose enclosing block is not
# visible at module level in this snippet — confirm it is defined above.
converted_total = [converter_to(x, 2) for x in total]

# NOTE(review): the variable is named ...Donor0 but the file loaded is
# 'new_donor1.exa' — confirm the index mismatch is intentional.
matrixDonor0 = numpy.array(matrix_from_exa('new_donor1.exa'))

# Three results whose `.p` attribute becomes the emission table of one
# coding state each.
c0, c1, c2 = calculator.calculate_proba2('cuts.txt')

coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

donor0_data = classify(matrixDonor0, 2)
donor0_states = sequence_state_factory(donor0_data, 'donor0')

post = State(DiscreteDistribution(equal_distribution), name='post')

# NOTE(review): the model name is spelled 'codiing' — likely a typo for
# 'coding'. Left unchanged because the name is runtime data.
model = HiddenMarkovModel('codiing to donor')

model.add_state(coding_state0)
model.add_state(coding_state1)
model.add_state(coding_state2)

add_sequence(model, donor0_states)

model.add_state(post)

# The model is entered at coding_state0; the remaining transitions are not
# visible in this snippet.
model.add_transition(model.start, coding_state0, 1)