def main():
    """Generate random sequences from an HMM and write them out as FASTA.

    Reads the number of sequences, the transition and emission tables and the
    output path from the parsed command-line arguments. Every generated
    sequence is written to the output file as a FASTA record and also echoed
    to stdout.
    """
    args = parse_args()

    #####################
    # START CODING HERE #
    #####################
    n_sequences = args.sequences
    transitions = load_tsv(args.transition)
    emissions = load_tsv(args.emission)

    with open(args.out_file, 'w') as fasta:
        for index in range(n_sequences):
            sequence = generate_sequence(transitions, emissions)
            # One FASTA record per sequence: header line, then the sequence.
            fasta.write('>random_sequence_%i\n%s\n' % (index, sequence))
            print(sequence)
def main(args=False):
    """Perform Viterbi training, given a set of sequences with A and E priors.

    Training is repeated until the parameters stop changing or until
    ``args.max_iter`` rounds have been run, whichever comes first. The final
    parameters are printed and, when ``args.out_dir`` is set, serialized to
    'viterbi_posterior_A' and 'viterbi_posterior_E' in that directory.

    args: parsed command-line arguments; when falsy, parse_args() is called.
    """
    # Local import so this block does not depend on the file's import header.
    from copy import deepcopy

    # Process arguments and load specified files
    if not args:
        args = parse_args()

    set_X, labels = load_fasta(args.fasta)  # List of sequences, list of labels
    A = load_tsv(args.transition)           # Nested Q -> Q dictionary
    E = load_tsv(args.emission)             # Nested Q -> S dictionary
    i_max = args.max_iter

    #####################
    # START CODING HERE #
    #####################
    # Iterate until you've reached i_max or until your parameters have converged!
    # Note Viterbi converges discretely (unlike Baum-Welch), so you don't need to
    # track your Sum Log-Likelihood to decide this: comparing the parameters
    # before and after a training round is enough.
    for _ in range(i_max):  # at most i_max rounds (was i_max + 1)
        # Snapshot the current parameters; deep copies keep the comparison
        # valid even if train_viterbi updates its arguments in place.
        previous_A, previous_E = deepcopy(A), deepcopy(E)
        A, E = train_viterbi(set_X, A, E)
        if A == previous_A and E == previous_E:
            break  # Parameters did not change: converged.

    print('========================================\n')
    print_params(A, E)
    #####################
    #  END CODING HERE  #
    #####################

    if args.out_dir:
        makedirs(args.out_dir, exist_ok=True)  # Make sure the output directory exists.
        for filename, params in (('viterbi_posterior_A', A),
                                 ('viterbi_posterior_E', E)):
            with open(op.join(args.out_dir, filename), 'w') as f:
                f.write(serialize(params))
def main(args=False):
    """Perform the specified algorithm, for a given set of sequences and parameters.

    Dispatches on ``args.command`` (viterbi, forward, backward or baumwelch).
    Output is printed according to ``args.verbosity`` and, when
    ``args.out_dir`` is set, also written to files in that directory.

    args: parsed command-line arguments; when falsy, parse_args() is called.
    """
    # Process arguments and load specified files
    if not args:
        args = parse_args()

    cmd = args.command  # viterbi, forward, backward or baumwelch
    verbosity = args.verbosity
    set_X, labels = load_fasta(args.fasta)  # List of sequences, list of labels
    A = load_tsv(args.transition)           # Nested Q -> Q dictionary
    E = load_tsv(args.emission)             # Nested Q -> S dictionary

    def save(filename, contents):
        # Deliberately a no-op when no out_dir is specified.
        if not args.out_dir:
            return
        makedirs(args.out_dir, exist_ok=True)  # Make sure the output directory exists.
        with open(op.join(args.out_dir, filename), 'w') as handle:
            handle.write(contents)

    def report_iteration(iteration, sll, trans, emis):
        # Progress output for one Baum-Welch round, gated by verbosity.
        if verbosity:
            print('Iteration %i, prior SLL = %1.2e' % (iteration, sll))
        if verbosity >= 2:
            print_params(trans, emis)

    # VITERBI
    if cmd == 'viterbi':
        for index, seq in enumerate(set_X):
            # Most probable state path, with its probability and matrix
            path, prob, trellis = viterbi(seq, A, E)

            # Save and/or print relevant output
            label = labels[index]
            save('%s.path' % label, path)
            save('%s.matrix' % label, serialize(trellis, seq))
            save('%s.p' % label, '%1.2e' % prob)
            print('>%s\n Path = %s' % (label, path))
            if verbosity:
                print(' Seq = %s\n P = %1.2e\n' % (seq, prob))
            if verbosity >= 2:
                print_trellis(trellis, seq)

    # FORWARD or BACKWARD
    elif cmd in ['forward', 'backward']:
        algorithm = forward if cmd == 'forward' else backward

        for index, seq in enumerate(set_X):
            # Forward/Backward probability and the corresponding matrix
            prob, trellis = algorithm(seq, A, E)

            # Save and/or print relevant output
            label = labels[index]
            save('%s.matrix' % label, serialize(trellis, seq))
            save('%s.p' % label, '%1.2e' % prob)
            if verbosity >= 2:
                print('\n>%s\n P = %1.2e\n' % (label, prob))
                print_trellis(trellis, seq)
            elif verbosity:
                print('>%-10s\tP = %1.2e' % (label, prob))

    # BAUM-WELCH TRAINING
    elif cmd == 'baumwelch':
        # Initialize with a first training round
        iteration = 1
        i_max = args.max_iter
        threshold = args.conv_thresh

        sll, A, E = baumwelch(set_X, A, E)
        report_iteration(iteration, sll, A, E)

        # Seed the previous SLL below the current one so that the first
        # improvement check in the loop condition passes.
        previous_sll = sll - threshold - 1

        # Iterate until convergence or limit
        while iteration < i_max and sll - previous_sll > threshold:
            iteration += 1
            previous_sll = sll

            # Sum Log-Likelihood of X given A and E, plus the updated
            # estimates (posteriors) for A and E.
            sll, A, E = baumwelch(set_X, A, E)
            report_iteration(iteration, sll, A, E)

        converged = sll - previous_sll <= threshold
        final_SLL = sum(log10(forward(seq, A, E)[0]) for seq in set_X)

        # Save and/or print relevant output
        save('SLL', '%1.2e\t%i\t%s' % (final_SLL, iteration, converged))
        save('posterior_A', serialize(A))
        save('posterior_E', serialize(E))

        if verbosity:
            print('========================================\n')

        if converged:
            print('Converged after %i iterations.' % iteration)
        else:
            print('Failed to converge after %i iterations.' % i_max)

        if verbosity:
            print('Final SLL: %1.2e' % final_SLL)
            print('Final parameters:')
            print_params(A, E)