def do_multiple_sequence_alignments(manager, root_node, msa_inputs):
    """Run one tree-guided multiple sequence alignment per entry of msa_inputs.

    Each entry of ``msa_inputs`` is unpacked as
    ``(args, env, seqs, track_id_sets, score_matrices)``.  The guide tree is
    either built by a ``GuideTreeBuilder`` task (when ``args.tree_file`` is
    None) or derived from the tree file named by ``args.tree_file``.  The
    alignment itself is then produced by a ``TreeMultipleSequenceAligner``
    task that is executed immediately, one execution per entry.

    Returns a list with the 'alignment' output of every entry, in input order.
    """
    results = []
    for args, env, seqs, track_id_sets, score_matrices in msa_inputs:
        # Child environment with profile squashing switched on; it is only
        # handed to the guide tree builder, never to the aligner.
        tree_env = Environment(parent=env)
        tree_env.keys['squash_profiles'] = True

        if args.tree_file is not None:
            # Read the user-supplied tree and convert it into PRALINE format.
            with open_resource(args.tree_file, 'trees') as handle:
                tree = get_tree(handle.read())

            # Labels are the text before the first ':' of each sequence
            # name, lower-cased.
            labels = [seq.name.split(':')[0].lower() for seq in seqs]
            size = len(labels)
            distances = np.zeros((size, size), dtype=np.float32)
            for row, row_label in enumerate(labels):
                for col, col_label in enumerate(labels):
                    # The diagonal keeps its initial zero.
                    if row != col:
                        distances[row, col] = tree_distance(tree, row_label,
                                                            col_label)

            clustering = HierarchicalClusteringAlgorithm(distances)
            merge_order = list(clustering.merge_order('average'))
            guide_tree = SequenceTree(seqs, merge_order)
        else:
            # No tree file given: let the guide tree builder compute one.
            tree_execution = Execution(manager, ROOT_TAG)
            tree_task = tree_execution.add_task(GuideTreeBuilder)
            tree_task.environment(tree_env)
            tree_task.inputs(sequences=seqs, track_id_sets=track_id_sets,
                             score_matrices=score_matrices)
            tree_outputs = run(tree_execution, verbose=False,
                               root_node=root_node)[0]
            guide_tree = tree_outputs['guide_tree']

        # Build the MSA along the guide tree, using the unmodified env.
        msa_execution = Execution(manager, ROOT_TAG)
        msa_task = msa_execution.add_task(TreeMultipleSequenceAligner)
        msa_task.environment(env)
        msa_task.inputs(sequences=seqs, guide_tree=guide_tree,
                        track_id_sets=track_id_sets,
                        score_matrices=score_matrices)
        msa_outputs = run(msa_execution, verbose=False,
                          root_node=root_node)[0]
        results.append(msa_outputs['alignment'])

    return results
def do_multiple_sequence_alignments(manager, root_node, msa_inputs):
    """Run tree-guided multiple sequence alignments as one batched execution.

    Each entry of ``msa_inputs`` is unpacked as
    ``(args, env, seqs, track_id_sets, score_matrices)``.  For every entry a
    guide tree is obtained first: built right away by a ``GuideTreeBuilder``
    task when ``args.tree_file`` is None, otherwise derived from the tree
    file named by ``args.tree_file``.  A ``TreeMultipleSequenceAligner`` task
    is then queued on a single shared execution, which is run once after all
    entries have been prepared.

    The time spent preparing the tasks is printed to standard output.  The
    print is written in call form with a single argument so that it parses
    and prints the same text on both Python 2 and Python 3.

    Returns a list holding the 'alignment' value of every output of the
    batched run, in the order ``run`` returns them.
    """
    start = time.time()
    msa_execution = Execution(manager, ROOT_TAG)

    for msa_input in msa_inputs:
        args, env, seqs, track_id_sets, score_matrices = msa_input

        # Child environment with profile squashing switched on; it is only
        # handed to the guide tree builder, never to the aligner.
        sub_env = Environment(parent=env)
        sub_env.keys['squash_profiles'] = True

        if args.tree_file is None:
            # Build guide tree
            execution = Execution(manager, ROOT_TAG)
            task = execution.add_task(GuideTreeBuilder)
            task.environment(sub_env)
            task.inputs(sequences=seqs, track_id_sets=track_id_sets,
                        score_matrices=score_matrices)
            outputs = run(execution, verbose=False, root_node=root_node)[0]
            guide_tree = outputs['guide_tree']
        else:
            # Read guide tree and convert it into PRALINE format
            with open_resource(args.tree_file, 'trees') as f:
                tree = get_tree(f.read())

            labels = [get_name(seq.name) for seq in seqs]
            d = np.zeros((len(labels), len(labels)), dtype=np.float32)
            for n, label_one in enumerate(labels):
                for m, label_two in enumerate(labels):
                    # The diagonal keeps its initial zero.
                    if n == m:
                        continue
                    d[n, m] = tree_distance(tree, label_one, label_two)

            hc = HierarchicalClusteringAlgorithm(d)
            guide_tree = SequenceTree(seqs, list(hc.merge_order('average')))

        # Queue the MSA task; it is executed after the loop with all others.
        task = msa_execution.add_task(TreeMultipleSequenceAligner)
        task.environment(env)
        task.inputs(sequences=seqs, guide_tree=guide_tree,
                    track_id_sets=track_id_sets,
                    score_matrices=score_matrices)

    end = time.time()
    print("Preparing : " + str(end - start) + " seconds")

    outputs = run(msa_execution, verbose=False, root_node=root_node)
    return [o['alignment'] for o in outputs]
def do_preprofiles(args, env, manager, alignments, seqs, verbose, root_node):
    """Build a profile track from each alignment and attach it to a sequence.

    For the alignment at position ``i`` a ``ProfileBuilder`` task is executed
    on its own and the resulting 'profile_track' output is added to
    ``seqs[i]`` under ``TRACK_ID_PREPROFILE``.  ``args`` is accepted but not
    used here.  Nothing is returned; ``seqs`` entries are modified in place.
    """
    for position, alignment in enumerate(alignments):
        profile_execution = Execution(manager, ROOT_TAG)
        profile_task = profile_execution.add_task(ProfileBuilder)
        profile_task.environment(env)
        profile_task.inputs(alignment=alignment, track_id=TRACK_ID_INPUT)

        # Only the first output of the run is used.
        result = run(profile_execution, verbose=verbose,
                     root_node=root_node)[0]
        profile_track = result['profile_track']
        seqs[position].add_track(TRACK_ID_PREPROFILE, profile_track)
def do_motif_annotation(args, env, manager, seqs, verbose, root_node):
    """Attach motif annotation tracks to the sequences and collect scores.

    Two sources are handled:

    * ``args.patterns``: every entry is ``(pattern[, score])``.  One
      ``PrositePatternAnnotator`` task is queued per sequence/pattern
      combination, all tasks are run together, and each 'prediction_track'
      output is added to its sequence.
    * ``args.annotation_files``: every entry is ``(path[, score])``.  The
      file is loaded with ``load_sequence_fasta`` and the input track of the
      annotation sequence with the same name is added to each sequence.  A
      sequence name missing from the file raises ``KeyError``.

    Returns a dict mapping each track id that was added to its score as a
    float, or to None when the entry carried no score.
    """
    track_id_format = "{0}_{1}"
    scores_by_track = {}

    # Queue every (sequence, pattern) task first, remembering what each task
    # was for, so outputs can be matched back by position.
    execution = Execution(manager, ROOT_TAG)
    queued = []
    for seq in seqs:
        for entry in args.patterns:
            pattern = entry[0]
            score = float(entry[1]) if len(entry) > 1 else None
            queued.append((seq, pattern, score))

            task = execution.add_task(PrositePatternAnnotator)
            task.environment(env)
            task.inputs(sequence=seq, pattern=pattern,
                        track_id=TRACK_ID_INPUT)

    outputs = run(execution, verbose=verbose, root_node=root_node)
    for position, output in enumerate(outputs):
        seq, pattern, score = queued[position]
        prediction = output['prediction_track']
        track_id = track_id_format.format(_TRACK_ID_BASE_PATTERN, pattern)
        seq.add_track(track_id, prediction)
        scores_by_track[track_id] = score

    # Annotation tracks read from files, matched to sequences by name.
    for entry in args.annotation_files:
        annotation_file = entry[0]
        score = float(entry[1]) if len(entry) > 1 else None

        annotation_seqs = load_sequence_fasta(annotation_file,
                                              ALPHABET_PROSITE)
        tracks_by_name = {}
        for annotation_seq in annotation_seqs:
            input_track = annotation_seq.get_track(TRACK_ID_INPUT)
            tracks_by_name[annotation_seq.name] = input_track

        for seq in seqs:
            file_track = tracks_by_name[seq.name]
            track_id = track_id_format.format(_TRACK_ID_BASE_FILE,
                                              annotation_file)
            seq.add_track(track_id, file_track)
            scores_by_track[track_id] = score

    return scores_by_track
def do_multiple_sequence_alignment(args, env, manager, seqs, track_id_sets,
                                   score_matrices, verbose, root_node):
    """Align ``seqs`` into a single multiple sequence alignment.

    When ``args.pregen_tree`` is false an ``AdHocMultipleSequenceAligner``
    task does all the work.  Otherwise a guide tree is built first with
    ``GuideTreeBuilder`` and then passed to ``TreeMultipleSequenceAligner``.

    Returns the 'alignment' output of the aligner task that was run.
    """
    if not args.pregen_tree:
        adhoc_execution = Execution(manager, ROOT_TAG)
        adhoc_task = adhoc_execution.add_task(AdHocMultipleSequenceAligner)
        adhoc_task.environment(env)
        adhoc_task.inputs(sequences=seqs, track_id_sets=track_id_sets,
                          score_matrices=score_matrices)
        adhoc_result = run(adhoc_execution, verbose=verbose,
                           root_node=root_node)[0]
        return adhoc_result['alignment']

    # The guide tree builder gets a child environment.  With neither local
    # nor global preprofiles requested the preprofiles are dummies, so we
    # can safely align by sequence and squash the profiles.
    tree_env = Environment(parent=env)
    if not (args.preprofile_local or args.preprofile_global):
        tree_env.keys['squash_profiles'] = True

    # Build guide tree
    tree_execution = Execution(manager, ROOT_TAG)
    tree_task = tree_execution.add_task(GuideTreeBuilder)
    tree_task.environment(tree_env)
    tree_task.inputs(sequences=seqs, track_id_sets=track_id_sets,
                     score_matrices=score_matrices)
    tree_result = run(tree_execution, verbose=verbose,
                      root_node=root_node)[0]

    # Build MSA along that tree, using the unmodified environment.
    msa_execution = Execution(manager, ROOT_TAG)
    msa_task = msa_execution.add_task(TreeMultipleSequenceAligner)
    msa_task.environment(env)
    msa_task.inputs(sequences=seqs, guide_tree=tree_result['guide_tree'],
                    track_id_sets=track_id_sets,
                    score_matrices=score_matrices)
    msa_result = run(msa_execution, verbose=verbose, root_node=root_node)[0]
    return msa_result['alignment']
def do_master_slave_alignments(args, env, manager, seqs, track_id_sets,
                               score_matrices, verbose, root_node):
    """Run one master-slave alignment per entry of ``seqs`` in a single batch.

    Every entry of ``seqs`` is a ``(master_sequence, slave_sequences)`` pair.
    A ``DummyMasterSlaveAligner`` task is queued for each pair and all tasks
    are run together in one execution.  ``args`` is accepted but not used.

    Returns a list with one slot per entry of ``seqs``: the 'alignment'
    value of the output at the same position, or None for positions that
    received no output.
    """
    batch = Execution(manager, ROOT_TAG)
    alignments = [None for _ in seqs]

    for master, slaves in seqs:
        task = batch.add_task(DummyMasterSlaveAligner)
        task.environment(env)
        task.inputs(master_sequence=master, slave_sequences=slaves,
                    track_id_sets=track_id_sets,
                    score_matrices=score_matrices)

    batch_outputs = run(batch, verbose=verbose, root_node=root_node)
    for position, output in enumerate(batch_outputs):
        alignments[position] = output['alignment']

    return alignments
def main():
    """Command-line entry point for the PRALINE MSA workflow.

    Parses the arguments, picks an execution manager, loads the score matrix
    and input sequences, assembles the environment keys from the options,
    runs ``PralineMultipleSequenceAlignmentWorkflow`` and writes the
    resulting alignment in the requested format.

    Raises ``DataError`` when ``args.output_format`` is neither 'fasta' nor
    'clustal'.
    """
    # Parse arguments.
    args = parse_args()
    verbose = not args.quiet or args.verbose

    # Setup the execution manager.
    index = TypeIndex()
    index.autoregister()

    if args.remote:
        if args.remote_secret is not None:
            # The secret is the first line of the file, exactly as
            # readline() returns it.
            with open(args.remote_secret, 'r') as secret_file:
                secret = secret_file.readline()
        else:
            # NOTE(review): hard-coded fallback secret; confirm this is
            # acceptable for remote deployments.
            secret = "__MUCH_SECRITY__"
        manager = RemoteManager(index, args.remote_host, args.remote_port,
                                secret)
    elif args.num_threads > 1:
        # NOTE(review): presumably one thread is reserved for the caller,
        # hence the "- 1"; confirm against ParallelExecutionManager.
        manager = ParallelExecutionManager(index, args.num_threads - 1)
    else:
        manager = Manager(index)

    # Register manager cleanup code at exit.
    atexit.register(_atexit_close_manager, manager=manager)

    # Load inputs and other data.
    with open_resource(args.score_matrix, "matrices") as matrix_file:
        score_matrix = load_score_matrix(matrix_file, alphabet=ALPHABET_AA)
    seqs = load_sequence_fasta(args.input, ALPHABET_AA)
    # Gap penalties are given as comma-separated numbers and stored negated.
    gap_series = [-float(penalty)
                  for penalty in args.gap_penalties.split(",")]

    # Setup environment.
    keys = {
        'gap_series': gap_series,
        'db_name': args.psi_blast_db,
        'num_seqs': args.psi_blast_num,
        'max_evalue': args.psi_blast_evalue,
        'profile_evalue': args.psi_blast_inclusion,
        'num_iterations': args.psi_blast_iters,
        'score_threshold': args.preprofile_score,
        'linkage_method': args.tree_linkage,
        'waterman_eggert_iterations': args.num_preprofile_alignments,
        'aligner': PairwiseAligner.tid,
        'debug': args.debug,
    }

    # In each group the "auto" flag wins over the plain semiglobal flag.
    keys['merge_mode'] = (
        'semiglobal_auto' if args.merge_semiglobal_auto
        else 'semiglobal' if args.merge_semiglobal
        else 'global'
    )
    keys['dist_mode'] = (
        'semiglobal_auto' if args.dist_semiglobal_auto
        else 'semiglobal' if args.dist_semiglobal
        else 'global'
    )
    keys['msa_mode'] = 'tree' if args.pregen_tree else 'ad_hoc'
    keys['preprofile_mode'] = (
        'global' if args.preprofile_global
        else 'local' if args.preprofile_local
        else 'dummy'
    )

    # 'run_psi_blast' is only ever set to True; it stays absent otherwise.
    if args.psi_blast:
        keys['run_psi_blast'] = True

    keys['accelerate'] = not args.no_accelerate

    # The BLAST+ root is optional and only taken from the process
    # environment when the variable is present.
    blast_plus_root = os.environ.get('BLAST_PLUS_ROOT')
    if blast_plus_root is not None:
        keys['blast_plus_root'] = blast_plus_root

    env = Environment(keys=keys)

    # Initialize root node for output
    root_node = TaskNode(ROOT_TAG)

    # Run the PRALINE MSA workflow
    execution = Execution(manager, ROOT_TAG)
    workflow_task = execution.add_task(PralineMultipleSequenceAlignmentWorkflow)
    workflow_task.inputs(sequences=seqs, score_matrix=score_matrix)
    workflow_task.environment(env)

    workflow_outputs = run(execution, verbose, root_node)[0]
    alignment = workflow_outputs['alignment']

    # Write alignment to output file.
    outfmt = args.output_format
    if outfmt not in ('fasta', 'clustal'):
        raise DataError("unknown output format: '{0}'".format(outfmt))
    if outfmt == 'fasta':
        write_alignment_fasta(args.output, alignment, TRACK_ID_INPUT)
    else:
        write_alignment_clustal(args.output, alignment, TRACK_ID_INPUT,
                                score_matrix)

    if verbose:
        sys.stdout.write('\n')

    # Collect log bundles
    if args.debug > 0:
        write_log_structure(root_node)