예제 #1
0
파일: POA.py 프로젝트: knyquist/biotk
    def generatePoaGraph(self):
        """
        Build a POA graph (multiple sequence alignment) from the subreads.

        The graph is seeded with ``self.reference`` when that attribute is
        truthy; otherwise the last subread is popped off a working copy of
        ``self.subreads`` and used as the seed. Whichever object seeds the
        graph is also stored on ``self.root_subread``. Every remaining
        subread is then globally aligned to the graph and merged into it.

        :return: the POA graph containing the seed and all merged subreads
        """
        remaining = list(self.subreads)
        if not self.reference:
            # No reference available: seed the graph with one of the subreads.
            seed = remaining.pop()
            self.root_subread = seed
            seed_seq = seed.read(aligned=False)
            seed_label = seed.qName
        else:
            # The reference sequence seeds the graph.
            seed = self.reference
            self.root_subread = seed
            seed_seq = str(seed.sequence)
            seed_label = seed.header

        msa = poagraph.POAGraph(seed_seq, label=seed_label)
        for read in remaining:
            # _check_direction decides whether the read should be
            # reverse-complemented relative to the seed before it is added
            # (per the original author's note it uses Levenshtein distance).
            oriented_seq = self._check_direction(read.read(aligned=False),
                                                 seed_seq)
            read_label = read.qName
            aln = seqgraphalignment.SeqGraphAlignment(oriented_seq,
                                                      msa,
                                                      fastMethod=True,
                                                      globalAlign=True)
            msa.incorporateSeqAlignment(aln, oriented_seq, label=read_label)

        return msa
def all_sequences_cost(pads, gid_arr, gid, template):
    """
    Sum the alignment encoding cost of a group of sequences against a template.

    :param pads: mapping indexed by label, giving the sequence for that label
    :param gid_arr: indexable collection of labels
    :param gid: iterable of positions into ``gid_arr`` selecting the group
    :param template: graph the sequences are aligned to; must provide
        ``encoding_cost()``
    :return: tuple ``(cost, conditions)`` where ``cost`` is the summed
        alignment cost plus ``template.encoding_cost()`` and ``conditions``
        holds, per sequence, the first column of the alignment's second
        return value cast to ``int``
    """
    total_alignment_cost = 0
    conditions = []
    for position in gid:
        member = pads[gid_arr[position]]
        aligned = seqgraphalignment.SeqGraphAlignment(member, template)
        cost, encoding = aligned.alignment_encoding_cost()
        total_alignment_cost += cost
        first_column = np.array(encoding)[:, 0]
        conditions.append(first_column.astype(int))
    return total_alignment_cost + template.encoding_cost(), conditions
예제 #3
0
def sequences_and_test(sequences, test_sequence):
    """
    Build a POA graph from ``sequences`` and merge ``test_sequence`` into it.

    :param sequences: sequences used to build the initial graph
    :param test_sequence: sequence aligned globally and added with the
        label ``"test"``
    :return: tuple ``(graph, result)`` where ``result`` is the alignment
        string of the second-to-last row with gap characters removed
    """
    scoring = dict(matchscore=1, mismatchscore=-1, gapscore=-2)

    poa = generate_poa_graph(sequences)
    aln = seqgraphalignment.SeqGraphAlignment(test_sequence,
                                              poa,
                                              fastMethod=False,
                                              globalAlign=True,
                                              **scoring)
    poa.incorporateSeqAlignment(aln, test_sequence, "test")

    rows = poa.generateAlignmentStrings()
    # NOTE(review): row -2 is presumably the sequence just added, with the
    # final row being a consensus -- confirm against generateAlignmentStrings.
    second_to_last = rows[-2]
    ungapped = second_to_last[1].replace("-", "")
    return poa, ungapped
예제 #4
0
def test_equal_strings(fast, glob, n_sequences):
    """
    Check that identical sequences produce identical alignment strings.

    :param fast: forwarded as ``fastMethod`` to the aligner
    :param glob: forwarded as ``globalAlign`` to the aligner
    :param n_sequences: number of copies of the sequence to align
    """
    sequence = "TATACCGGCG"
    copies = [sequence for _ in range(n_sequences)]

    graph = poagraph.POAGraph(copies[0], "0")
    for number, copy_of_sequence in enumerate(copies[1:], start=1):
        aligned = seqgraphalignment.SeqGraphAlignment(copy_of_sequence,
                                                      graph,
                                                      fastMethod=fast,
                                                      globalAlign=glob,
                                                      matchscore=1,
                                                      mismatchscore=-1,
                                                      gapscore=-2)
        graph.incorporateSeqAlignment(aligned, sequence, str(number))

    rows = graph.generateAlignmentStrings()
    # Every row must equal the first row's alignment string.
    assert all(rows[0][1] == row_string for _, row_string in rows)
예제 #5
0
def generate_poa_graph(sequences):
    """
    Create a POA graph from the first sequence and align the rest into it.

    Sequences are labelled with their position in ``sequences`` as a string
    (``"0"``, ``"1"``, ...).

    :param sequences: sequences to align; the first one seeds the graph
    :return: graph: the completed POA graph resulting from the given sequences
    """
    graph = poagraph.POAGraph(sequences[0], "0")

    for position, current in enumerate(sequences[1:], start=1):
        aligned = seqgraphalignment.SeqGraphAlignment(current,
                                                      graph,
                                                      fastMethod=False,
                                                      globalAlign=True,
                                                      matchscore=1,
                                                      mismatchscore=-1,
                                                      gapscore=-2)
        graph.incorporateSeqAlignment(aligned, current, str(position))

    return graph
예제 #6
0
    def generatePoaGraph(self):
        """
        Build a POA graph (multiple sequence alignment) from the subreads.

        The last subread of a working copy of ``self.subreads`` is popped,
        stored on ``self.root_subread`` and used to seed the graph. Every
        remaining subread is passed through ``self._check_direction``
        together with the seed sequence, globally aligned to the graph and
        merged into it.

        :return: the POA graph containing the seed and all merged subreads
        """
        remaining = list(self.subreads)
        seed = remaining.pop()
        self.root_subread = seed
        seed_seq = seed.read(aligned=False)
        seed_label = seed.qName

        msa = poagraph.POAGraph(seed_seq, label=seed_label)
        for read in remaining:
            oriented_seq = self._check_direction(read.read(aligned=False),
                                                 seed_seq)
            read_label = read.qName
            aln = seqgraphalignment.SeqGraphAlignment(oriented_seq,
                                                      msa,
                                                      fastMethod=True,
                                                      globalAlign=True)
            msa.incorporateSeqAlignment(aln, oriented_seq, label=read_label)

        return msa
예제 #7
0
#!/usr/bin/env python
from __future__ import print_function
import argparse
import sys
import numpy
import poagraph
import seqgraphalignment
import simplefasta

if __name__ == "__main__":
    # Command-line entry point: read FASTA records from a file (or stdin),
    # seed a POA graph with the first record, align every further record
    # into the graph, and print one "label alignment" line per row.
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('-g','--globalAlign', action='store_true', help='Global alignment, or (default) local alignment')
    args = parser.parse_args()

    fasta = simplefasta.readfasta(args.infile)
    if len(fasta) == 0:
        # Without at least one record there is nothing to seed the graph
        # with; exit with a message instead of an IndexError traceback.
        sys.exit("error: no sequences found in input")

    # Each record is indexed as (label, sequence); the first seeds the graph.
    first_label, first_sequence = fasta[0][0], fasta[0][1]
    graph = poagraph.POAGraph(first_sequence, first_label)
    for label, sequence in fasta[1:]:
        alignment = seqgraphalignment.SeqGraphAlignment(sequence, graph, globalAlign=args.globalAlign)
        graph.incorporateSeqAlignment(alignment, sequence, label)

    alignments = graph.generateAlignmentStrings()
    for label, alignstring in alignments:
        print("{0:15s} {1:s}".format(label, alignstring))

    
예제 #8
0
                        default=-1,
                        help='Mismatch score, default=-1')
    parser.add_argument('-H',
                        '--html',
                        nargs='?',
                        type=argparse.FileType('w'),
                        default='poa.html',
                        help='html output')
    args = parser.parse_args()

    seqNo = 0
    fasta = simplefasta.readfasta(args.infile)
    graph = poagraph.POAGraph(fasta[0][1], fasta[0][0])
    for label, sequence in fasta[1:]:
        alignment = seqgraphalignment.SeqGraphAlignment(
            sequence,
            graph,
            fastMethod=not args.simple,
            globalAlign=args.globalAlign,
            matchscore=args.match,
            mismatchscore=args.mismatch,
            gapscore=args.gap)
        graph.incorporateSeqAlignment(alignment, sequence, label)

    alignments = graph.generateAlignmentStrings()
    for label, alignstring in alignments:
        print("{0:15s} {1:s}".format(label, alignstring))

    if args.html is not None:
        graph.htmlOutput(args.html)
def InfoShield_MDL(pads, output_path):
    """
    Greedily group sequences into templates, keeping a template only when
    it lowers the running total cost.

    Each pass of the outer loop seeds a POA graph with the first remaining
    sequence, collects the other remaining sequences whose alignment cost
    against that seed is lower than their standalone cost, and (if any were
    collected) derives a template via ``dichotomous_search`` and
    ``slot_identify``. The template is recorded only if the recomputed total
    cost drops. The grouped sequences are removed from the pool either way.

    :param pads: mapping from label to sequence (iterated with ``.items()``
        and indexed by label)
    :param output_path: forwarded to ``output_results``
    :return: tuple ``(relative_reduction, temp_dict, elapsed)`` where
        ``relative_reduction`` is ``(init_cost - final_cost) / init_cost``,
        ``temp_dict`` maps the 1-based template number to the labels
        assigned to that template, and ``elapsed`` is the wall-clock time of
        the main loop in seconds
    """
    # Starting cost: every sequence encoded on its own, plus len(pads).
    init_cost = prev_total_cost = np.sum(
        [sequence_cost(s) for _, s in pads.items()]) + len(pads)
    # Labels of the sequences that have not been grouped yet.
    gid_arr = np.array([l for l, _ in pads.items()])

    start0 = time.time()
    # temp_arr: accepted templates; cond_arr: per-template alignment
    # conditions; temp_dict: template number -> labels; iter: pass counter
    # (only incremented here; note it shadows the builtin ``iter``).
    temp_arr, cond_arr, temp_dict, iter = [], [], {}, 0
    while len(gid_arr) > 0:
        iter += 1

        # Seed a graph with the first remaining sequence. ``gid`` holds
        # positions into gid_arr of the current group's members.
        graph, gid = poagraph.POAGraph(pads[gid_arr[0]], gid_arr[0]), [0]
        seq_total_cost = sequence_cost(pads[gid_arr[0]])
        # Snapshot of the seed-only graph: candidates are scored against
        # this copy, while accepted sequences are merged into ``graph``.
        graph_0 = copy.deepcopy(graph)

        # start1/end1 bracket the candidate scan; they are not read again
        # in this function.
        start1 = time.time()
        for idx, label in enumerate(gid_arr[1:]):
            sequence = pads[label]
            alignment = seqgraphalignment.SeqGraphAlignment(sequence, graph_0)
            align_mdl, _ = alignment.alignment_encoding_cost()
            seq_cost = sequence_cost(sequence)

            # Accept the candidate when aligning it to the seed is cheaper
            # than encoding it standalone.
            if align_mdl < seq_cost:
                # idx counts from gid_arr[1:], so shift by one to get the
                # position in gid_arr.
                gid.append(idx + 1)
                alignment = seqgraphalignment.SeqGraphAlignment(
                    sequence, graph)
                graph.incorporateSeqAlignment(alignment, sequence, label)
                seq_total_cost += seq_cost
        end1 = time.time()

        # Only groups with at least one member besides the seed are
        # considered for a template.
        if len(gid) > 1:
            template, min_cost = dichotomous_search(pads, gid_arr, gid, graph)
            template = slot_identify(pads, gid_arr, gid, template)

            # Cost of encoding every group member against the final template.
            align_cost, c_arr = 0, []
            for id in gid:
                sequence = pads[gid_arr[id]]
                alignment = seqgraphalignment.SeqGraphAlignment(
                    sequence, template)
                cost, cond = alignment.alignment_encoding_cost()
                align_cost += cost
                c_arr.append(cond)

            # Start from the previous total minus the standalone cost of
            # the grouped sequences.
            total_cost = prev_total_cost - seq_total_cost
            # NOTE(review): the next three statements appear to swap the
            # template-count / template-id terms for len(temp_arr) templates
            # with those for len(temp_arr) + 1 -- confirm against the MDL
            # formulation this implements.
            if len(temp_arr) != 0:
                total_cost -= log_star(len(temp_arr)) + len(gid_arr) * ceil(
                    np.log2(len(temp_arr)))
            total_cost += (len(gid_arr) + len(gid)) * ceil(
                np.log2(len(temp_arr) + 1))
            total_cost += log_star(len(temp_arr) +
                                   1) + template.encoding_cost() + align_cost

            # Keep the template only if it lowers the total cost.
            if total_cost < prev_total_cost:
                prev_total_cost = total_cost
                temp_arr.append(template)
                cond_arr.append(c_arr)
                # Key is the 1-based template number (length after append).
                temp_dict[len(temp_arr)] = gid_arr[gid]

        # Remove the grouped sequences from the pool, whether or not a
        # template was accepted for them.
        gid_arr = np.delete(gid_arr, gid)

    end0 = time.time()
    output_results(temp_arr, cond_arr, output_path)
    return (init_cost - prev_total_cost) / init_cost, temp_dict, end0 - start0