Exemplo n.º 1
0
def main():
    """Print the summed amino-acid and motif scores for one input.

    Expects three command-line arguments: the input path, the name of the
    amino-acid score matrix resource, and a comma-separated list of gap
    penalties (each value is negated before use).
    """
    input_path, matrix_name, penalty_spec = (sys.argv[1], sys.argv[2],
                                             sys.argv[3])
    gap_penalties = [-float(token) for token in penalty_spec.split(",")]

    # Load the amino-acid score matrix from the bundled "matrices" resources.
    with open_resource(matrix_name, "matrices") as matrix_file:
        aa_score_matrix = load_score_matrix(matrix_file,
                                            alphabet=ALPHABET_AA)

    aa_scores, motif_scores = get_aa_motif_scores(
        input_path, aa_score_matrix, gap_penalties, 1.0, 0.0)

    # Total of the first amino-acid score series.
    aa_total = sum(aa_scores[0])
    print(aa_total)
    print()

    # Grand total over every motif track's scores.
    motif_total = sum(map(sum, motif_scores))
    print(motif_total)
Exemplo n.º 2
0
def create_msa_input(job, args, manager, root_node):
    """Prepare motif-annotated DNA sequences for multiple alignment.

    Loads the sequences named by ``args.input`` with the DNA alphabet,
    annotates them with motif tracks, runs all-against-all master-slave
    alignments and builds preprofiles from those alignments.

    :param job: not used by this function.
    :param args: parsed options. This function reads ``input``,
        ``score_matrix``, ``gap_penalties``, ``debug``,
        ``motif_match_score`` and ``score_spacers``; the object is also
        forwarded to the ``do_*`` helpers.
    :param manager: execution manager forwarded to the ``do_*`` helpers.
    :param root_node: currently ignored; see the note in the body.
    :returns: tuple ``(env, seqs, msa_track_id_sets, score_matrices)``.
    """
    verbose = False
    alphabet = ALPHABET_DNA

    seqs = load_sequence_fasta(args.input, alphabet)

    # Fall back to the bundled nucleotide matrix when none was requested.
    if args.score_matrix is not None:
        score_matrix_file = args.score_matrix
    else:
        score_matrix_file = 'nucleotide'

    # Read score parameters.
    with open_resource(score_matrix_file, "matrices") as f:
        score_matrices = [load_score_matrix(f, alphabet=alphabet)]
    # Penalties are given as positive numbers and stored negated.
    gap_series = [-float(x) for x in args.gap_penalties.split(",")]

    # Setup environment.
    keys = {
        'gap_series': gap_series,
        'debug': args.debug,
        'merge_mode': 'global',
        'dist_mode': 'global',
        'accelerate': True,
    }
    env = Environment(keys=keys)

    # Initialize root node for output.
    # NOTE(review): this replaces the ``root_node`` argument, so the caller's
    # node is never used -- confirm whether that is intended.
    root_node = TaskNode(ROOT_TAG)

    # Annotate the motifs from the files and patterns.
    track_scores = do_motif_annotation(args, env, manager, seqs, verbose,
                                       root_node)

    # Build one score matrix per motif track. ``dict.items()`` is used
    # instead of the Python 2-only ``iteritems()`` so this runs on both
    # Python 2 and Python 3.
    motif_score_matrices = {}
    for trid, score in track_scores.items():
        if score is None:
            score = args.motif_match_score

        motif_score_matrices[trid] = get_motif_score_matrix(
            score, args.score_spacers)

    # Add all the new annotation tracks to the list of tracks to use
    # in the alignment.
    track_id_sets = [[TRACK_ID_INPUT]]
    for trid, _track in seqs[0].tracks:
        if trid in motif_score_matrices:
            track_id_sets.append([trid])
            score_matrices.append(motif_score_matrices[trid])

    # Build initial sets of which sequences to align against every master
    # sequence. By default, we want to align every input sequence against
    # every other input sequence.
    master_slave_seqs = [
        (master_seq, [seq for seq in seqs if seq is not master_seq])
        for master_seq in seqs
    ]

    master_slave_alignments = do_master_slave_alignments(
        args, env, manager, master_slave_seqs, track_id_sets, score_matrices,
        verbose, root_node)

    # Build preprofiles from master-slave alignments.
    do_preprofiles(args, env, manager, master_slave_alignments, seqs, verbose,
                   root_node)
    msa_track_id_sets = _replace_input_track_id(track_id_sets)

    return env, seqs, msa_track_id_sets, score_matrices
Exemplo n.º 3
0
Arquivo: cmd.py Projeto: ibivu/PRALINE
def main():
    """Command-line entry point for the PRALINE alignment workflow.

    Parses the arguments, picks an execution manager (remote, parallel or
    in-process), loads the score matrix and the input sequences, runs
    ``PralineMultipleSequenceAlignmentWorkflow`` and writes the resulting
    alignment in FASTA or Clustal format.

    Raises ``DataError`` when the requested output format is unknown.
    """
    opts = parse_args()
    verbose = (not opts.quiet) or opts.verbose

    # Choose how tasks are going to be executed.
    type_index = TypeIndex()
    type_index.autoregister()
    if opts.remote:
        if opts.remote_secret is not None:
            # The secret is the first line of the given file, as read.
            with open(opts.remote_secret, 'r') as secret_file:
                secret = secret_file.readline()
        else:
            secret = "__MUCH_SECRITY__"

        manager = RemoteManager(type_index, opts.remote_host,
                                opts.remote_port, secret)
    elif opts.num_threads > 1:
        manager = ParallelExecutionManager(type_index, opts.num_threads - 1)
    else:
        manager = Manager(type_index)

    # Make sure the manager is closed when the interpreter exits.
    atexit.register(_atexit_close_manager, manager=manager)

    # Load the score matrix, the sequences and the gap penalties.
    with open_resource(opts.score_matrix, "matrices") as matrix_file:
        score_matrix = load_score_matrix(matrix_file, alphabet=ALPHABET_AA)
    seqs = load_sequence_fasta(opts.input, ALPHABET_AA)
    gap_series = [-float(token) for token in opts.gap_penalties.split(",")]

    # Collect the environment settings for the workflow.
    keys = {
        'gap_series': gap_series,
        'db_name': opts.psi_blast_db,
        'num_seqs': opts.psi_blast_num,
        'max_evalue': opts.psi_blast_evalue,
        'profile_evalue': opts.psi_blast_inclusion,
        'num_iterations': opts.psi_blast_iters,
        'score_threshold': opts.preprofile_score,
        'linkage_method': opts.tree_linkage,
        'waterman_eggert_iterations': opts.num_preprofile_alignments,
        'aligner': PairwiseAligner.tid,
        'debug': opts.debug,
    }

    # The "auto" flag takes precedence over the plain semiglobal flag.
    keys['merge_mode'] = ('semiglobal_auto' if opts.merge_semiglobal_auto
                          else 'semiglobal' if opts.merge_semiglobal
                          else 'global')
    keys['dist_mode'] = ('semiglobal_auto' if opts.dist_semiglobal_auto
                         else 'semiglobal' if opts.dist_semiglobal
                         else 'global')

    keys['msa_mode'] = 'tree' if opts.pregen_tree else 'ad_hoc'

    keys['preprofile_mode'] = ('global' if opts.preprofile_global
                               else 'local' if opts.preprofile_local
                               else 'dummy')

    # This key is only present when PSI-BLAST was requested.
    if opts.psi_blast:
        keys['run_psi_blast'] = True

    keys['accelerate'] = not opts.no_accelerate

    # Pass the BLAST+ installation root along when the variable is set.
    blast_plus_root = os.environ.get('BLAST_PLUS_ROOT')
    if blast_plus_root is not None:
        keys['blast_plus_root'] = blast_plus_root

    env = Environment(keys=keys)

    # Root node under which the run's output is collected.
    root_node = TaskNode(ROOT_TAG)

    # Set up and run the PRALINE MSA workflow.
    execution = Execution(manager, ROOT_TAG)
    task = execution.add_task(PralineMultipleSequenceAlignmentWorkflow)
    task.inputs(sequences=seqs, score_matrix=score_matrix)
    task.environment(env)

    results = run(execution, verbose, root_node)
    alignment = results[0]['alignment']

    # Write the alignment in the requested format.
    outfmt = opts.output_format
    if outfmt == "clustal":
        write_alignment_clustal(opts.output, alignment, TRACK_ID_INPUT,
                                score_matrix)
    elif outfmt == 'fasta':
        write_alignment_fasta(opts.output, alignment, TRACK_ID_INPUT)
    else:
        raise DataError("unknown output format: '{0}'".format(outfmt))

    if verbose:
        sys.stdout.write('\n')

    # Write out the log structure when debugging is enabled.
    if opts.debug > 0:
        write_log_structure(root_node)
Exemplo n.º 4
0
def main():
    """Command-line entry point for the motif-aware alignment program.

    Parses the arguments, loads the input sequences and score matrix,
    annotates motifs, runs master-slave alignments, builds preprofiles and
    performs the multiple sequence alignment. The alignment is written in
    FASTA or Clustal format and can optionally be pickled and dumped per
    track.

    Raises ``DataError`` when no score matrix can be selected or when the
    requested output format is unknown.
    """
    # Parse arguments.
    args = parse_args()
    verbose = not args.quiet or args.verbose

    # See if we're doing DNA or protein alignments.
    # TODO: if unspecified, autodetect this based on input file contents?
    alphabet = None
    if args.input_dna:
        alphabet = ALPHABET_DNA
    elif args.input_protein:
        alphabet = ALPHABET_AA

    # Setup the execution manager.
    index = TypeIndex()
    index.autoregister()
    if args.num_threads > 1:
        manager = ParallelExecutionManager(index, args.num_threads)
    else:
        manager = Manager(index)

    seqs = load_sequence_fasta(args.input, alphabet)

    # Pick the score matrix: an explicit choice wins, otherwise use the
    # default that belongs to the selected alphabet.
    if args.score_matrix is not None:
        score_matrix_file = args.score_matrix
    elif alphabet == ALPHABET_AA:
        score_matrix_file = 'blosum62'
    elif alphabet == ALPHABET_DNA:
        score_matrix_file = 'nucleotide'
    else:
        # Previously this case left score_matrix_file unbound and crashed
        # with an UnboundLocalError further down.
        raise DataError("no score matrix given and no input alphabet "
                        "selected to choose a default from")

    # Read score parameters.
    with open_resource(score_matrix_file, "matrices") as f:
        score_matrices = [load_score_matrix(f, alphabet=alphabet)]
    # Penalties are given as positive numbers and stored negated.
    gap_series = [-float(x) for x in args.gap_penalties.split(",")]

    # Setup environment.
    keys = {}
    keys['gap_series'] = gap_series
    keys['score_threshold'] = args.preprofile_score
    keys['linkage_method'] = args.tree_linkage
    keys['waterman_eggert_iterations'] = args.num_preprofile_alignments
    keys['debug'] = args.debug
    if args.merge_semiglobal:
        keys['merge_mode'] = 'semiglobal'
        keys['dist_mode'] = 'semiglobal'
    else:
        keys['merge_mode'] = 'global'
        keys['dist_mode'] = 'global'

    keys['accelerate'] = not args.no_accelerate
    env = Environment(keys=keys)

    # Initialize root node for output
    root_node = TaskNode(ROOT_TAG)

    # Annotate the motifs from the files and patterns.
    track_scores = do_motif_annotation(args, env, manager, seqs,
                                       verbose, root_node)
    # Build score matrices.
    motif_score_matrices = {}
    for trid, score in six.iteritems(track_scores):
        if score is None:
            score = args.motif_match_score

        motif_score_matrices[trid] = get_motif_score_matrix(
            score, args.score_spacers)

    # Add all the new annotation tracks to the list of tracks to use
    # in the alignment.
    track_id_sets = [[TRACK_ID_INPUT]]
    for trid, _track in seqs[0].tracks:
        if trid in motif_score_matrices:
            track_id_sets.append([trid])
            score_matrices.append(motif_score_matrices[trid])

    # Build initial sets of which sequences to align against every master
    # sequence. By default, we want to align every input sequence against
    # every other input sequence.
    master_slave_seqs = [
        (master_seq, [seq for seq in seqs if seq is not master_seq])
        for master_seq in seqs
    ]

    master_slave_alignments = do_master_slave_alignments(
        args, env, manager, master_slave_seqs, track_id_sets, score_matrices,
        verbose, root_node)

    # Build preprofiles from master-slave alignments.
    do_preprofiles(args, env, manager, master_slave_alignments, seqs,
                   verbose, root_node)
    msa_track_id_sets = _replace_input_track_id(track_id_sets)

    # Do multiple sequence alignment from preprofile-annotated sequences.
    alignment = do_multiple_sequence_alignment(
        args, env, manager, seqs, msa_track_id_sets, score_matrices,
        verbose, root_node)

    # Write alignment to output file.
    outfmt = args.output_format
    if outfmt == 'fasta':
        write_alignment_fasta(args.output, alignment, TRACK_ID_INPUT)
    elif outfmt == "clustal":
        # score_matrices[0] is the matrix loaded above for the input track;
        # the name ``score_matrix`` used here before was never defined.
        write_alignment_clustal(args.output, alignment, TRACK_ID_INPUT,
                                score_matrices[0])
    else:
        raise DataError("unknown output format: '{0}'".format(outfmt))

    # Dump pickled alignment object if user asked for it.
    if args.dump_alignment is not None:
        with open(args.dump_alignment, 'wb') as fo:
            pickle.dump(alignment, fo)

    if args.dump_all_tracks is not None:
        try:
            os.mkdir(args.dump_all_tracks)
        except OSError:
            # An already existing directory is fine; any other failure
            # (permissions, missing parent, path is a file) is a real error.
            if not os.path.isdir(args.dump_all_tracks):
                raise

        # Only plain tracks are dumped.
        all_trids = [trid for trid, track in alignment.items[0].tracks
                     if track.tid == PlainTrack.tid]

        for trid in all_trids:
            filename = "dump-{0}.aln".format(trid)
            path = os.path.join(args.dump_all_tracks, filename)

            if outfmt == "fasta":
                write_alignment_fasta(path, alignment, trid)
            elif outfmt == "clustal":
                write_alignment_clustal(path, alignment, trid, None)
            else:
                raise DataError("unknown output format: '{0}'".format(outfmt))

    if verbose:
        sys.stdout.write('\n')

    # Collect log bundles
    if args.debug > 0:
        write_log_structure(root_node)
Exemplo n.º 5
0
def create_msa_input(job, args, manager, root_node):
    """Prepare motif-annotated DNA sequences for multiple alignment.

    Loads the sequences named by ``args.input`` with the DNA alphabet,
    annotates them with motif tracks, runs all-against-all master-slave
    alignments and builds preprofiles from those alignments.

    :param job: not used by this function.
    :param args: parsed options. This function reads ``input``,
        ``score_matrix``, ``gap_penalties``, ``debug``,
        ``motif_match_score`` and ``score_spacers``; the object is also
        forwarded to the ``do_*`` helpers.
    :param manager: execution manager forwarded to the ``do_*`` helpers.
    :param root_node: currently ignored; see the note in the body.
    :returns: tuple ``(env, seqs, msa_track_id_sets, score_matrices)``.
    """
    verbose = False
    alphabet = ALPHABET_DNA

    seqs = load_sequence_fasta(args.input, alphabet)

    # Use the bundled nucleotide matrix unless one was requested explicitly.
    score_matrix_file = args.score_matrix
    if score_matrix_file is None:
        score_matrix_file = 'nucleotide'

    # Read score parameters.
    with open_resource(score_matrix_file, "matrices") as f:
        score_matrices = [load_score_matrix(f, alphabet=alphabet)]
    # Penalties are given as positive numbers and stored negated.
    gap_series = [-float(x) for x in args.gap_penalties.split(",")]

    # Setup environment.
    keys = {}
    keys['gap_series'] = gap_series
    keys['debug'] = args.debug
    keys['merge_mode'] = 'global'
    keys['dist_mode'] = 'global'
    keys['accelerate'] = True
    env = Environment(keys=keys)

    # Initialize root node for output.
    # NOTE(review): this replaces the ``root_node`` argument, so the caller's
    # node is never used -- confirm whether that is intended.
    root_node = TaskNode(ROOT_TAG)

    # Annotate the motifs from the files and patterns.
    track_scores = do_motif_annotation(args, env, manager, seqs,
                                       verbose, root_node)

    # Build score matrices. ``items()`` replaces the Python 2-only
    # ``iteritems()``, which does not exist on Python 3 dictionaries.
    motif_score_matrices = {}
    for trid, score in track_scores.items():
        if score is None:
            score = args.motif_match_score

        motif_score_matrices[trid] = get_motif_score_matrix(
            score, args.score_spacers)

    # Add all the new annotation tracks to the list of tracks to use
    # in the alignment.
    track_id_sets = [[TRACK_ID_INPUT]]
    for trid, _track in seqs[0].tracks:
        if trid in motif_score_matrices:
            track_id_sets.append([trid])
            score_matrices.append(motif_score_matrices[trid])

    # Build initial sets of which sequences to align against every master
    # sequence. By default, we want to align every input sequence against
    # every other input sequence.
    master_slave_seqs = []
    for master_seq in seqs:
        slave_seqs = [seq for seq in seqs if seq is not master_seq]
        master_slave_seqs.append((master_seq, slave_seqs))

    master_slave_alignments = do_master_slave_alignments(
        args, env, manager, master_slave_seqs, track_id_sets, score_matrices,
        verbose, root_node)

    # Build preprofiles from master-slave alignments.
    do_preprofiles(args, env, manager, master_slave_alignments, seqs,
                   verbose, root_node)
    msa_track_id_sets = _replace_input_track_id(track_id_sets)

    return env, seqs, msa_track_id_sets, score_matrices