Пример #1
0
def do_multiple_sequence_alignments(manager, root_node, msa_inputs):
    """Run one tree-guided multiple sequence alignment per input bundle.

    Every element of ``msa_inputs`` is unpacked as
    ``(args, env, seqs, track_id_sets, score_matrices)``.  If
    ``args.tree_file`` is None, a guide tree is obtained by running a
    GuideTreeBuilder task; otherwise the tree is read from that resource
    and converted into a SequenceTree.  A TreeMultipleSequenceAligner
    task is then run with the guide tree, each in its own Execution.

    Returns a list with the 'alignment' output of each run, in the order
    of ``msa_inputs``.
    """
    results = []
    for args, env, seqs, track_id_sets, score_matrices in msa_inputs:
        # Child environment with 'squash_profiles' switched on; only the
        # guide tree builder runs under it, the aligner gets plain `env`.
        tree_env = Environment(parent=env)
        tree_env.keys['squash_profiles'] = True

        if args.tree_file is not None:
            # Read guide tree and convert it into PRALINE format
            with open_resource(args.tree_file, 'trees') as handle:
                tree = get_tree(handle.read())

            # Labels are the lower-cased part of each name before ':'.
            names = [seq.name.split(':')[0].lower() for seq in seqs]
            size = len(names)
            distances = np.zeros((size, size), dtype=np.float32)
            for row, row_name in enumerate(names):
                for col, col_name in enumerate(names):
                    # The diagonal keeps its initial zero.
                    if row != col:
                        distances[row, col] = tree_distance(tree, row_name,
                                                            col_name)

            clustering = HierarchicalClusteringAlgorithm(distances)
            merge_order = list(clustering.merge_order('average'))
            guide_tree = SequenceTree(seqs, merge_order)
        else:
            # Build guide tree
            tree_execution = Execution(manager, ROOT_TAG)
            tree_task = tree_execution.add_task(GuideTreeBuilder)
            tree_task.environment(tree_env)
            tree_task.inputs(sequences=seqs,
                             track_id_sets=track_id_sets,
                             score_matrices=score_matrices)

            tree_outputs = run(tree_execution, verbose=False,
                               root_node=root_node)[0]
            guide_tree = tree_outputs['guide_tree']

        # Build MSA
        msa_execution = Execution(manager, ROOT_TAG)
        msa_task = msa_execution.add_task(TreeMultipleSequenceAligner)
        msa_task.environment(env)
        msa_task.inputs(sequences=seqs,
                        guide_tree=guide_tree,
                        track_id_sets=track_id_sets,
                        score_matrices=score_matrices)

        msa_outputs = run(msa_execution, verbose=False,
                          root_node=root_node)[0]
        results.append(msa_outputs['alignment'])

    return results
Пример #2
0
def do_multiple_sequence_alignments(manager, root_node, msa_inputs):
    """Prepare all tree-guided alignments, then run them as one batch.

    Every element of ``msa_inputs`` is unpacked as
    ``(args, env, seqs, track_id_sets, score_matrices)``.  For each one a
    guide tree is obtained first: built by a GuideTreeBuilder task when
    ``args.tree_file`` is None, otherwise read from that resource and
    converted into a SequenceTree.  The TreeMultipleSequenceAligner tasks
    are all added to a single shared Execution that is run once at the
    end.

    The time spent preparing the tasks is printed to standard output.

    Returns a list with the 'alignment' entry of each output of that
    final run.
    """
    start = time.time()
    msa_execution = Execution(manager, ROOT_TAG)
    for args, env, seqs, track_id_sets, score_matrices in msa_inputs:
        # Child environment with 'squash_profiles' switched on; only the
        # guide tree builder runs under it, the aligner gets plain `env`.
        sub_env = Environment(parent=env)
        sub_env.keys['squash_profiles'] = True

        if args.tree_file is None:
            # Build guide tree (run immediately, in its own execution).
            tree_execution = Execution(manager, ROOT_TAG)
            tree_task = tree_execution.add_task(GuideTreeBuilder)
            tree_task.environment(sub_env)
            tree_task.inputs(sequences=seqs, track_id_sets=track_id_sets,
                             score_matrices=score_matrices)

            tree_outputs = run(tree_execution, verbose=False,
                               root_node=root_node)[0]
            guide_tree = tree_outputs['guide_tree']
        else:
            # Read guide tree and convert it into PRALINE format
            with open_resource(args.tree_file, 'trees') as f:
                tree = get_tree(f.read())

            labels = [get_name(seq.name) for seq in seqs]
            d = np.zeros((len(labels), len(labels)), dtype=np.float32)
            for n, label_one in enumerate(labels):
                for m, label_two in enumerate(labels):
                    # The diagonal keeps its initial zero.
                    if n == m:
                        continue
                    d[n, m] = tree_distance(tree, label_one, label_two)

            hc = HierarchicalClusteringAlgorithm(d)
            guide_tree = SequenceTree(seqs, list(hc.merge_order('average')))

        # Queue the MSA task; it is only executed after the loop.
        task = msa_execution.add_task(TreeMultipleSequenceAligner)
        task.environment(env)
        task.inputs(sequences=seqs, guide_tree=guide_tree,
                    track_id_sets=track_id_sets, score_matrices=score_matrices)
    end = time.time()
    # Parenthesized single-argument form: prints the same text under the
    # Python 2 print statement and is also valid as a Python 3 call.
    print("Preparing : " + str(end - start) + " seconds")
    outputs = run(msa_execution, verbose=False, root_node=root_node)
    alignments = [o['alignment'] for o in outputs]

    return alignments
Пример #3
0
def do_preprofiles(args, env, manager, alignments, seqs, verbose, root_node):
    """Add a preprofile track to each sequence, built from its alignment.

    For the alignment at position ``i`` a ProfileBuilder task is run in
    its own Execution, and the 'profile_track' output is added to
    ``seqs[i]`` under TRACK_ID_PREPROFILE.  ``args`` is accepted but not
    used.  Nothing is returned; ``seqs`` is modified in place.
    """
    for index, alignment in enumerate(alignments):
        execution = Execution(manager, ROOT_TAG)
        builder_task = execution.add_task(ProfileBuilder)
        builder_task.environment(env)
        builder_task.inputs(alignment=alignment, track_id=TRACK_ID_INPUT)

        # Only one task was queued, so only the first output is read.
        first_output = run(execution, verbose=verbose,
                           root_node=root_node)[0]
        profile_track = first_output['profile_track']
        seqs[index].add_track(TRACK_ID_PREPROFILE, profile_track)
Пример #4
0
def do_motif_annotation(args, env, manager, seqs, verbose, root_node):
    """Attach motif annotation tracks to the sequences.

    Two sources are handled:

    * ``args.patterns`` -- each entry is ``(pattern,)`` or
      ``(pattern, score)``.  A PrositePatternAnnotator task is queued for
      every (sequence, pattern) combination and all of them are run in a
      single Execution; each 'prediction_track' output is added to its
      sequence.
    * ``args.annotation_files`` -- each entry is ``(path,)`` or
      ``(path, score)``.  The file is loaded with ``load_sequence_fasta``
      and the TRACK_ID_INPUT track of the entry whose name equals a
      sequence's name is added to that sequence.

    Returns a dict mapping each track id that was added to its score as a
    float, or None where the entry carried no score.
    """
    track_id_format = "{0}_{1}"

    scores_by_track = {}

    # Remember (sequence, pattern, score) in the order tasks are queued so
    # that the outputs can be matched back up by position.
    execution = Execution(manager, ROOT_TAG)
    queued = []
    for seq in seqs:
        for entry in args.patterns:
            pattern = entry[0]
            score = float(entry[1]) if len(entry) > 1 else None
            queued.append((seq, pattern, score))

            annotator_task = execution.add_task(PrositePatternAnnotator)
            annotator_task.environment(env)
            annotator_task.inputs(sequence=seq, pattern=pattern,
                                  track_id=TRACK_ID_INPUT)

    results = run(execution, verbose=verbose, root_node=root_node)
    for position, result in enumerate(results):
        seq, pattern, score = queued[position]

        prediction = result['prediction_track']

        trid = track_id_format.format(_TRACK_ID_BASE_PATTERN, pattern)
        seq.add_track(trid, prediction)
        scores_by_track[trid] = score

    for entry in args.annotation_files:
        annotation_file = entry[0]
        score = float(entry[1]) if len(entry) > 1 else None

        annotated = load_sequence_fasta(annotation_file, ALPHABET_PROSITE)

        # Index the annotation tracks by sequence name.
        tracks_by_name = {}
        for annotated_seq in annotated:
            input_track = annotated_seq.get_track(TRACK_ID_INPUT)
            tracks_by_name[annotated_seq.name] = input_track

        for seq in seqs:
            # A sequence whose name is absent from the file is a KeyError.
            matching_track = tracks_by_name[seq.name]

            trid = track_id_format.format(_TRACK_ID_BASE_FILE,
                                          annotation_file)
            seq.add_track(trid, matching_track)
            scores_by_track[trid] = score

    return scores_by_track
Пример #5
0
def do_preprofiles(args, env, manager, alignments, seqs, verbose, root_node):
    """Add a preprofile track to each sequence, built from its alignment.

    For the alignment at position ``i`` a ProfileBuilder task is run in
    its own Execution, and the 'profile_track' output is added to
    ``seqs[i]`` under TRACK_ID_PREPROFILE.  ``args`` is accepted but not
    used.  Nothing is returned; ``seqs`` is modified in place.
    """
    for index, alignment in enumerate(alignments):
        execution = Execution(manager, ROOT_TAG)
        builder_task = execution.add_task(ProfileBuilder)
        builder_task.environment(env)
        builder_task.inputs(alignment=alignment, track_id=TRACK_ID_INPUT)

        # Only one task was queued, so only the first output is read.
        first_output = run(execution, verbose=verbose,
                           root_node=root_node)[0]
        profile_track = first_output['profile_track']
        seqs[index].add_track(TRACK_ID_PREPROFILE, profile_track)
Пример #6
0
def do_motif_annotation(args, env, manager, seqs, verbose, root_node):
    """Attach motif annotation tracks to the sequences.

    Two sources are handled:

    * ``args.patterns`` -- each entry is ``(pattern,)`` or
      ``(pattern, score)``.  A PrositePatternAnnotator task is queued for
      every (sequence, pattern) combination and all of them are run in a
      single Execution; each 'prediction_track' output is added to its
      sequence.
    * ``args.annotation_files`` -- each entry is ``(path,)`` or
      ``(path, score)``.  The file is loaded with ``load_sequence_fasta``
      and the TRACK_ID_INPUT track of the entry whose name equals a
      sequence's name is added to that sequence.

    Returns a dict mapping each track id that was added to its score as a
    float, or None where the entry carried no score.
    """
    track_id_format = "{0}_{1}"

    scores_by_track = {}

    # Remember (sequence, pattern, score) in the order tasks are queued so
    # that the outputs can be matched back up by position.
    execution = Execution(manager, ROOT_TAG)
    queued = []
    for seq in seqs:
        for entry in args.patterns:
            pattern = entry[0]
            score = float(entry[1]) if len(entry) > 1 else None
            queued.append((seq, pattern, score))

            annotator_task = execution.add_task(PrositePatternAnnotator)
            annotator_task.environment(env)
            annotator_task.inputs(sequence=seq, pattern=pattern,
                                  track_id=TRACK_ID_INPUT)

    results = run(execution, verbose=verbose, root_node=root_node)
    for position, result in enumerate(results):
        seq, pattern, score = queued[position]

        prediction = result['prediction_track']

        trid = track_id_format.format(_TRACK_ID_BASE_PATTERN, pattern)
        seq.add_track(trid, prediction)
        scores_by_track[trid] = score

    for entry in args.annotation_files:
        annotation_file = entry[0]
        score = float(entry[1]) if len(entry) > 1 else None

        annotated = load_sequence_fasta(annotation_file, ALPHABET_PROSITE)

        # Index the annotation tracks by sequence name.
        tracks_by_name = {}
        for annotated_seq in annotated:
            input_track = annotated_seq.get_track(TRACK_ID_INPUT)
            tracks_by_name[annotated_seq.name] = input_track

        for seq in seqs:
            # A sequence whose name is absent from the file is a KeyError.
            matching_track = tracks_by_name[seq.name]

            trid = track_id_format.format(_TRACK_ID_BASE_FILE,
                                          annotation_file)
            seq.add_track(trid, matching_track)
            scores_by_track[trid] = score

    return scores_by_track
Пример #7
0
def do_multiple_sequence_alignment(args, env, manager, seqs,
                                   track_id_sets, score_matrices,
                                   verbose, root_node):
    """Align ``seqs`` and return the resulting alignment.

    When ``args.pregen_tree`` is falsy a single
    AdHocMultipleSequenceAligner task is run.  Otherwise a guide tree is
    built first with GuideTreeBuilder and then handed to a
    TreeMultipleSequenceAligner task.  Each task runs in its own
    Execution.

    Returns the 'alignment' output of the aligner task.
    """
    if not args.pregen_tree:
        adhoc_execution = Execution(manager, ROOT_TAG)
        adhoc_task = adhoc_execution.add_task(AdHocMultipleSequenceAligner)
        adhoc_task.environment(env)
        adhoc_task.inputs(sequences=seqs, track_id_sets=track_id_sets,
                          score_matrices=score_matrices)

        adhoc_outputs = run(adhoc_execution, verbose=verbose,
                            root_node=root_node)[0]
        return adhoc_outputs['alignment']

    # Dummy preprofiles, so we can safely align by sequence.
    tree_env = Environment(parent=env)

    # 'squash_profiles' is only set when neither preprofile mode is on.
    if not (args.preprofile_local or args.preprofile_global):
        tree_env.keys['squash_profiles'] = True

    # Build guide tree
    tree_execution = Execution(manager, ROOT_TAG)
    tree_task = tree_execution.add_task(GuideTreeBuilder)
    tree_task.environment(tree_env)
    tree_task.inputs(sequences=seqs, track_id_sets=track_id_sets,
                     score_matrices=score_matrices)

    tree_outputs = run(tree_execution, verbose=verbose,
                       root_node=root_node)[0]

    # Build MSA (under the caller's environment, not the child one).
    msa_execution = Execution(manager, ROOT_TAG)
    msa_task = msa_execution.add_task(TreeMultipleSequenceAligner)
    msa_task.environment(env)
    msa_task.inputs(sequences=seqs, guide_tree=tree_outputs['guide_tree'],
                    track_id_sets=track_id_sets,
                    score_matrices=score_matrices)

    msa_outputs = run(msa_execution, verbose=verbose,
                      root_node=root_node)[0]
    return msa_outputs['alignment']
Пример #8
0
def do_master_slave_alignments(args, env, manager, seqs,
                               track_id_sets, score_matrices, verbose,
                               root_node):
    """Run a master-slave alignment for every entry of ``seqs``.

    ``seqs`` is iterated as ``(master_seq, slave_seqs)`` pairs.  One
    DummyMasterSlaveAligner task per pair is queued on a shared Execution
    which is then run once.  ``args`` is accepted but not used.

    Returns a list with one slot per entry of ``seqs``; slot ``n`` holds
    the 'alignment' entry of the n-th output of the run.
    """
    execution = Execution(manager, ROOT_TAG)

    # One placeholder per entry; filled in by output position below.
    alignments = [None for _entry in seqs]
    for master_seq, slave_seqs in seqs:
        aligner_task = execution.add_task(DummyMasterSlaveAligner)
        aligner_task.environment(env)
        aligner_task.inputs(master_sequence=master_seq,
                            slave_sequences=slave_seqs,
                            track_id_sets=track_id_sets,
                            score_matrices=score_matrices)

    results = run(execution, verbose=verbose, root_node=root_node)
    for position, result in enumerate(results):
        alignments[position] = result['alignment']

    return alignments
Пример #9
0
def do_master_slave_alignments(args, env, manager, seqs, track_id_sets,
                               score_matrices, verbose, root_node):
    """Run a master-slave alignment for every entry of ``seqs``.

    ``seqs`` is iterated as ``(master_seq, slave_seqs)`` pairs.  One
    DummyMasterSlaveAligner task per pair is queued on a shared Execution
    which is then run once.  ``args`` is accepted but not used.

    Returns a list with one slot per entry of ``seqs``; slot ``n`` holds
    the 'alignment' entry of the n-th output of the run.
    """
    execution = Execution(manager, ROOT_TAG)

    # One placeholder per entry; filled in by output position below.
    alignments = [None for _entry in seqs]
    for master_seq, slave_seqs in seqs:
        aligner_task = execution.add_task(DummyMasterSlaveAligner)
        aligner_task.environment(env)
        aligner_task.inputs(master_sequence=master_seq,
                            slave_sequences=slave_seqs,
                            track_id_sets=track_id_sets,
                            score_matrices=score_matrices)

    results = run(execution, verbose=verbose, root_node=root_node)
    for position, result in enumerate(results):
        alignments[position] = result['alignment']

    return alignments
Пример #10
0
def main():
    """Command-line entry point for the PRALINE MSA workflow.

    Parses the arguments, picks an execution manager (remote, parallel or
    plain), loads the score matrix and input sequences, translates the
    arguments into environment keys, runs
    PralineMultipleSequenceAlignmentWorkflow and writes the resulting
    alignment in the requested format.

    Raises DataError when ``args.output_format`` is neither 'fasta' nor
    'clustal'.
    """
    # Parse arguments.
    args = parse_args()
    verbose = not args.quiet or args.verbose

    # Setup the execution manager.
    index = TypeIndex()
    index.autoregister()
    if args.remote:
        if args.remote_secret is not None:
            # NOTE(review): readline() keeps any trailing newline in the
            # secret -- confirm the remote side expects that.
            with open(args.remote_secret, 'r') as secret_file:
                secret = secret_file.readline()
        else:
            # NOTE(review): hard-coded fallback secret when no secret file
            # is given.
            secret = "__MUCH_SECRITY__"

        manager = RemoteManager(index, args.remote_host, args.remote_port,
                                secret)
    elif args.num_threads > 1:
        manager = ParallelExecutionManager(index, args.num_threads - 1)
    else:
        manager = Manager(index)

    # Register manager cleanup code at exit.
    atexit.register(_atexit_close_manager, manager=manager)

    # Load inputs and other data.
    with open_resource(args.score_matrix, "matrices") as matrix_file:
        score_matrix = load_score_matrix(matrix_file, alphabet=ALPHABET_AA)
    seqs = load_sequence_fasta(args.input, ALPHABET_AA)
    # The comma-separated penalties are stored with their sign flipped.
    gap_series = [-float(penalty)
                  for penalty in args.gap_penalties.split(",")]

    # Setup environment.
    keys = {
        'gap_series': gap_series,
        'db_name': args.psi_blast_db,
        'num_seqs': args.psi_blast_num,
        'max_evalue': args.psi_blast_evalue,
        'profile_evalue': args.psi_blast_inclusion,
        'num_iterations': args.psi_blast_iters,
        'score_threshold': args.preprofile_score,
        'linkage_method': args.tree_linkage,
        'waterman_eggert_iterations': args.num_preprofile_alignments,
        'aligner': PairwiseAligner.tid,
        'debug': args.debug,
    }

    # In each mode selection the first matching flag wins.
    keys['merge_mode'] = ('semiglobal_auto' if args.merge_semiglobal_auto
                          else 'semiglobal' if args.merge_semiglobal
                          else 'global')
    keys['dist_mode'] = ('semiglobal_auto' if args.dist_semiglobal_auto
                         else 'semiglobal' if args.dist_semiglobal
                         else 'global')
    keys['msa_mode'] = 'tree' if args.pregen_tree else 'ad_hoc'
    keys['preprofile_mode'] = ('global' if args.preprofile_global
                               else 'local' if args.preprofile_local
                               else 'dummy')

    # 'run_psi_blast' is only ever set to True; otherwise it stays absent.
    if args.psi_blast:
        keys['run_psi_blast'] = True

    keys['accelerate'] = not args.no_accelerate

    # 'blast_plus_root' is only set when the variable is in the environment.
    blast_plus_root = os.environ.get('BLAST_PLUS_ROOT')
    if blast_plus_root is not None:
        keys['blast_plus_root'] = blast_plus_root

    env = Environment(keys=keys)

    # Initialize root node for output
    root_node = TaskNode(ROOT_TAG)

    # Run the PRALINE MSA workflow
    execution = Execution(manager, ROOT_TAG)
    workflow_task = execution.add_task(PralineMultipleSequenceAlignmentWorkflow)
    workflow_task.inputs(sequences=seqs, score_matrix=score_matrix)
    workflow_task.environment(env)

    workflow_outputs = run(execution, verbose, root_node)[0]
    alignment = workflow_outputs['alignment']

    # Write alignment to output file.
    outfmt = args.output_format
    if outfmt == 'fasta':
        write_alignment_fasta(args.output, alignment, TRACK_ID_INPUT)
    elif outfmt == "clustal":
        write_alignment_clustal(args.output, alignment, TRACK_ID_INPUT,
                                score_matrix)
    else:
        raise DataError("unknown output format: '{0}'".format(outfmt))

    if verbose:
        sys.stdout.write('\n')

    # Collect log bundles
    if args.debug > 0:
        write_log_structure(root_node)