Example #1
def main():
    args = parse_args()
    #####################
    # START CODING HERE #
    #####################
    # Uncomment and complete (i.e. replace '?' in) the lines below:

    N = args.sequences
    A = load_tsv(args.transition)
    E = load_tsv(args.emission)
    out_file = args.out_file

    with open(out_file, 'w') as f:
        for i in range(N):
            seq = generate_sequence(A, E)
            f.write('>random_sequence_%i\n%s\n' % (i, seq))
            print(seq)
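
The call to generate_sequence(A, E) is left to the student in the original scaffold. Below is a minimal sketch of one possible implementation, assuming the nested-dictionary layout produced by load_tsv (A: state -> {state: probability}, E: state -> {symbol: probability}) and dedicated begin/end states named 'B' and 'E'; those state names are assumptions and may differ in the actual parameter files.

import random

def generate_sequence(A, E):
    """Sample one emission sequence from the HMM defined by A and E.

    Assumes A[q] maps state q to its successor probabilities and E[q] maps
    each emitting state to its symbol probabilities. The begin/end state
    names 'B' and 'E' are assumptions, not part of the original scaffold.
    """
    def draw(dist):
        # Draw one key from a {key: probability} dictionary.
        r = random.random()
        total = 0.0
        for key, p in dist.items():
            total += p
            if r < total:
                return key
        return key  # Guard against rounding error in the probabilities.

    seq = []
    state = draw(A['B'])            # Leave the begin state.
    while state != 'E':             # Stop once the end state is reached.
        seq.append(draw(E[state]))  # Emit a symbol from the current state.
        state = draw(A[state])      # Move to the next state.
    return ''.join(seq)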
Example #2
def main(args=None):
    """Perform Viterbi training, given a set of sequences with A and E priors."""

    # Process arguments and load specified files
    if args is None:
        args = parse_args()

    set_X, labels = load_fasta(args.fasta)  # List of sequences, list of labels
    A = load_tsv(args.transition)  # Nested Q -> Q dictionary
    E = load_tsv(args.emission)  # Nested Q -> S dictionary

    i_max = args.max_iter

    #####################
    # START CODING HERE #
    #####################
    # Iterate until you've reached i_max or until your parameters have converged!
    # Note Viterbi converges discretely (unlike Baum-Welch), so you don't need to
    # track your Sum Log-Likelihood to decide this.
    i = 0
    converged = False
    while i < i_max and not converged:
        i += 1
        new_A, new_E = train_viterbi(set_X, A, E)
        converged = (new_A == A and new_E == E)
        A, E = new_A, new_E

    print('========================================\n')
    print_params(A, E)
    #####################
    #  END CODING HERE  #
    #####################

    if args.out_dir:
        makedirs(args.out_dir,
                 exist_ok=True)  # Make sure the output directory exists.
        A_path = op.join(args.out_dir, 'viterbi_posterior_A')
        with open(A_path, 'w') as f:
            f.write(serialize(A))
        E_path = op.join(args.out_dir, 'viterbi_posterior_E')
        with open(E_path, 'w') as f:
            f.write(serialize(E))
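
train_viterbi(set_X, A, E) is not shown above. The sketch below illustrates one plausible single training step, assuming a viterbi(X, A, E) helper that returns (state_path, probability, trellis) as in Example #3; the pseudocount of 1 and the fact that begin/end transitions are left untouched are simplifying assumptions, not taken from the original code.

def train_viterbi(set_X, A, E):
    """One Viterbi-training step: re-estimate A and E from the Viterbi paths.

    Decode every sequence, re-count transitions and emissions along the most
    probable paths, and renormalize the counts into new parameter estimates.
    The pseudocount of 1 (to avoid zero probabilities) is an assumption.
    """
    # Start every count at a pseudocount of 1 (assumed smoothing).
    A_counts = {q: {r: 1 for r in A[q]} for q in A}
    E_counts = {q: {s: 1 for s in E[q]} for q in E}

    for X in set_X:
        Q, P, T = viterbi(X, A, E)
        # Count emissions along the most probable state path ...
        for state, symbol in zip(Q, X):
            E_counts[state][symbol] += 1
        # ... and transitions between consecutive states on that path.
        # (Transitions from the begin state and into the end state are not
        # re-counted in this simplified sketch.)
        for q, r in zip(Q, Q[1:]):
            A_counts[q][r] += 1

    # Normalize the counts into the new estimates.
    new_A = {q: {r: c / sum(row.values()) for r, c in row.items()}
             for q, row in A_counts.items()}
    new_E = {q: {s: c / sum(row.values()) for s, c in row.items()}
             for q, row in E_counts.items()}
    return new_A, new_E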
Example #3
def main(args=None):
    """Perform the specified algorithm for a given set of sequences and parameters."""

    # Process arguments and load specified files
    if args is None:
        args = parse_args()

    cmd = args.command  # viterbi, forward, backward or baumwelch
    verbosity = args.verbosity
    set_X, labels = load_fasta(args.fasta)  # List of sequences, list of labels
    A = load_tsv(args.transition)  # Nested Q -> Q dictionary
    E = load_tsv(args.emission)  # Nested Q -> S dictionary

    def save(filename, contents):
        if args.out_dir:
            makedirs(args.out_dir,
                     exist_ok=True)  # Make sure the output directory exists.
            path = op.join(args.out_dir, filename)
            with open(path, 'w') as f:
                f.write(contents)
        # Note this function does nothing if no out_dir is specified!

    # VITERBI
    if cmd == 'viterbi':
        for j, X in enumerate(set_X):  # For every sequence:
            # Calculate the most probable state path, with the corresponding probability and matrix
            Q, P, T = viterbi(X, A, E)

            # Save and/or print relevant output
            label = labels[j]
            save('%s.path' % label, Q)
            save('%s.matrix' % label, serialize(T, X))
            save('%s.p' % label, '%1.2e' % P)
            print('>%s\n Path = %s' % (label, Q))
            if verbosity: print(' Seq  = %s\n P    = %1.2e\n' % (X, P))
            if verbosity >= 2: print_trellis(T, X)

    # FORWARD or BACKWARD
    elif cmd in ['forward', 'backward']:
        if cmd == 'forward':
            algorithm = forward
        elif cmd == 'backward':
            algorithm = backward

        for j, X in enumerate(set_X):  # For every sequence:
            # Calculate the Forward/Backward probability and corresponding matrix
            P, T = algorithm(X, A, E)

            # Save and/or print relevant output
            label = labels[j]
            save('%s.matrix' % label, serialize(T, X))
            save('%s.p' % label, '%1.2e' % P)
            if verbosity >= 2:
                print('\n>%s\n P = %1.2e\n' % (label, P))
                print_trellis(T, X)
            elif verbosity:
                print('>%-10s\tP = %1.2e' % (label, P))

    # BAUM-WELCH TRAINING
    elif cmd == 'baumwelch':
        # Initialize
        i = 1
        i_max = args.max_iter
        threshold = args.conv_thresh

        current_SLL, A, E = baumwelch(set_X, A, E)
        if verbosity:
            print('Iteration %i, prior SLL = %1.2e' % (i, current_SLL))
        if verbosity >= 2: print_params(A, E)

        last_SLL = current_SLL - threshold - 1  # Iterate at least once

        # Iterate until convergence or limit
        while i < i_max and current_SLL - last_SLL > threshold:
            i += 1
            last_SLL = current_SLL

            # Calculate the Sum Log-Likelihood of X given A and E,
            # and update the estimates (posteriors) for A and E.
            current_SLL, A, E = baumwelch(set_X, A, E)

            if verbosity:
                print('Iteration %i, prior SLL = %1.2e' % (i, current_SLL))
            if verbosity >= 2: print_params(A, E)

        converged = current_SLL - last_SLL <= threshold
        final_SLL = sum([log10(forward(X, A, E)[0]) for X in set_X])

        # Save and/or print relevant output
        save('SLL', '%1.2e\t%i\t%s' % (final_SLL, i, converged))
        save('posterior_A', serialize(A))
        save('posterior_E', serialize(E))
        if verbosity: print('========================================\n')

        if converged:
            print('Converged after %i iterations.' % i)
        else:
            print('Failed to converge after %i iterations.' % i_max)

        if verbosity:
            print('Final SLL: %1.2e' % final_SLL)
            print('Final parameters:')
            print_params(A, E)
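
The Baum-Welch branch derives its final Sum Log-Likelihood from forward(X, A, E)[0]. For reference, a compact sketch of that forward computation is given below; it assumes the same nested-dictionary parameters and, like the sketches above, begin/end states named 'B' and 'E'. The trellis it returns is a plain state -> list-of-floats mapping, which may differ from the layout expected by serialize and print_trellis in the course code.

def forward(X, A, E):
    """Forward algorithm: total probability of X plus the forward trellis.

    Returns (P, T) where T[q][i] is the probability of emitting X[:i+1] and
    ending in state q. The begin/end state names 'B' and 'E' are assumptions
    about the parameter files, not something confirmed by the scaffold.
    """
    states = list(E)                            # Emitting states only.
    T = {q: [0.0] * len(X) for q in states}

    # Initialization: leave the begin state and emit the first symbol.
    for q in states:
        T[q][0] = A['B'][q] * E[q][X[0]]

    # Recursion: sum over every predecessor state at each position.
    for i in range(1, len(X)):
        for q in states:
            T[q][i] = E[q][X[i]] * sum(T[r][i - 1] * A[r][q] for r in states)

    # Termination: transition from every state into the end state.
    P = sum(T[q][-1] * A[q]['E'] for q in states)
    return P, T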