def generatePoaGraph(self):
    """
    Build a POA graph (multiple sequence alignment) from this object's subreads.

    The graph is seeded with ``self.reference`` when that attribute is truthy;
    otherwise the last entry of ``self.subreads`` is removed from the working
    list and used as the seed. The chosen seed is stored on
    ``self.root_subread``. Every remaining subread is then aligned to the graph
    and merged into it.

    :return: the populated ``poagraph.POAGraph``
    """
    # Work on a copy so that popping the seed never mutates self.subreads.
    pending = list(self.subreads)

    if self.reference:
        # A reference was supplied: it becomes the root of the graph.
        seed = self.root_subread = self.reference
        seed_seq, seed_label = str(seed.sequence), seed.header
    else:
        # No reference: fall back to aligning against one of the subreads.
        seed = self.root_subread = pending.pop()
        seed_seq, seed_label = seed.read(aligned=False), seed.qName

    graph = poagraph.POAGraph(seed_seq, label=seed_label)

    for read in pending:
        # _check_direction decides the orientation relative to the seed
        # (per the original note: Levenshtein distance is used to decide
        # whether the read should be reverse-complemented first).
        oriented_seq = self._check_direction(read.read(aligned=False), seed_seq)
        read_label = read.qName
        aligned = seqgraphalignment.SeqGraphAlignment(
            oriented_seq, graph, fastMethod=True, globalAlign=True)
        graph.incorporateSeqAlignment(aligned, oriented_seq, label=read_label)

    return graph
def all_sequences_cost(pads, gid_arr, gid, template):
    """
    Sum the alignment encoding cost of a group of sequences against a template.

    :param pads: lookup from label to sequence (indexed as ``pads[label]``)
    :param gid_arr: indexable collection of labels
    :param gid: iterable of positions into ``gid_arr`` selecting the group
    :param template: graph the sequences are aligned against; must provide
        ``encoding_cost()``
    :return: ``(total_cost, cond_int)`` where ``total_cost`` is the summed
        alignment cost plus ``template.encoding_cost()`` and ``cond_int`` holds,
        per sequence, column 0 of the second value returned by
        ``alignment_encoding_cost()`` cast to ``int``
    """
    total_alignment_cost = 0
    first_columns = []

    for position in gid:
        member_sequence = pads[gid_arr[position]]
        aligned = seqgraphalignment.SeqGraphAlignment(member_sequence, template)
        cost, conditions = aligned.alignment_encoding_cost()
        total_alignment_cost += cost
        # Only the first column of the condition table is kept, as integers.
        leading_column = np.array(conditions)[:, 0]
        first_columns.append(leading_column.astype(int))

    return total_alignment_cost + template.encoding_cost(), first_columns
def sequences_and_test(sequences, test_sequence):
    """
    Build a POA graph from ``sequences`` and then add ``test_sequence`` to it.

    :param sequences: sequences used to build the initial graph via
        ``generate_poa_graph``
    :param test_sequence: sequence aligned globally and merged under the
        label ``"test"``
    :return: ``(graph, result)`` where ``result`` is the second-to-last
        alignment string reported by the graph with every ``"-"`` removed
    """
    graph = generate_poa_graph(sequences)

    # Same scoring scheme as the one used when the graph was built.
    scoring = {"matchscore": 1, "mismatchscore": -1, "gapscore": -2}
    test_alignment = seqgraphalignment.SeqGraphAlignment(
        test_sequence, graph, fastMethod=False, globalAlign=True, **scoring)
    graph.incorporateSeqAlignment(test_alignment, test_sequence, "test")

    rows = graph.generateAlignmentStrings()
    penultimate_row = rows[-2]
    ungapped = penultimate_row[1].replace("-", "")
    return graph, ungapped
def test_equal_strings(fast, glob, n_sequences):
    """
    Aligning identical sequences must yield identical alignment strings.

    :param fast: forwarded as ``fastMethod`` to the aligner
    :param glob: forwarded as ``globalAlign`` to the aligner
    :param n_sequences: number of copies of the fixed sequence to align
    """
    sequence = "TATACCGGCG"
    copies = [sequence] * n_sequences

    graph = poagraph.POAGraph(copies[0], "0")
    for number, current in enumerate(copies[1:], start=1):
        aligned = seqgraphalignment.SeqGraphAlignment(
            current,
            graph,
            fastMethod=fast,
            globalAlign=glob,
            matchscore=1,
            mismatchscore=-1,
            gapscore=-2,
        )
        graph.incorporateSeqAlignment(aligned, sequence, str(number))

    rows = graph.generateAlignmentStrings()
    # Every reported alignment string has to equal the first one.
    assert all(rows[0][1] == row_text for _, row_text in rows)
def generate_poa_graph(sequences):
    """
    Create a POA graph from the first sequence and align the rest into it.

    Sequences are labelled with their position in ``sequences`` rendered as a
    string (``"0"``, ``"1"``, ...).

    :param sequences: sequences to align; the first one seeds the graph
    :return: graph: the completed POA graph resulting from the given sequences
    """
    graph = poagraph.POAGraph(sequences[0], "0")

    for number, current in enumerate(sequences[1:], start=1):
        aligned = seqgraphalignment.SeqGraphAlignment(
            current,
            graph,
            fastMethod=False,
            globalAlign=True,
            matchscore=1,
            mismatchscore=-1,
            gapscore=-2,
        )
        graph.incorporateSeqAlignment(aligned, current, str(number))

    return graph
def generatePoaGraph(self):
    """
    Build a POA graph (multiple sequence alignment) from this object's subreads.

    The last entry of ``self.subreads`` is taken as the seed of the graph and
    recorded on ``self.root_subread``; each remaining subread is oriented with
    ``self._check_direction``, aligned globally, and merged into the graph.

    :return: the populated ``poagraph.POAGraph``
    """
    # Work on a copy so that popping the seed never mutates self.subreads.
    pending = list(self.subreads)

    seed = self.root_subread = pending.pop()
    seed_seq = seed.read(aligned=False)
    graph = poagraph.POAGraph(seed_seq, label=seed.qName)

    for read in pending:
        oriented_seq = self._check_direction(read.read(aligned=False), seed_seq)
        read_label = read.qName
        aligned = seqgraphalignment.SeqGraphAlignment(
            oriented_seq, graph, fastMethod=True, globalAlign=True)
        graph.incorporateSeqAlignment(aligned, oriented_seq, label=read_label)

    return graph
#!/usr/bin/env python from __future__ import print_function import argparse import sys import numpy import poagraph import seqgraphalignment import simplefasta if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin) parser.add_argument('-g','--globalAlign', action='store_true', help='Global alignment, or (default) local alignment') args = parser.parse_args() seqNo = 0 fasta = simplefasta.readfasta(args.infile) graph = poagraph.POAGraph(fasta[0][1], fasta[0][0]) for label, sequence in fasta[1:]: alignment = seqgraphalignment.SeqGraphAlignment(sequence, graph, globalAlign=args.globalAlign) graph.incorporateSeqAlignment(alignment, sequence, label) alignments = graph.generateAlignmentStrings() for label,alignstring in alignments: print("{0:15s} {1:s}".format(label, alignstring))
default=-1, help='Mismatch score, default=-1') parser.add_argument('-H', '--html', nargs='?', type=argparse.FileType('w'), default='poa.html', help='html output') args = parser.parse_args() seqNo = 0 fasta = simplefasta.readfasta(args.infile) graph = poagraph.POAGraph(fasta[0][1], fasta[0][0]) for label, sequence in fasta[1:]: alignment = seqgraphalignment.SeqGraphAlignment( sequence, graph, fastMethod=not args.simple, globalAlign=args.globalAlign, matchscore=args.match, mismatchscore=args.mismatch, gapscore=args.gap) graph.incorporateSeqAlignment(alignment, sequence, label) alignments = graph.generateAlignmentStrings() for label, alignstring in alignments: print("{0:15s} {1:s}".format(label, alignstring)) if args.html is not None: graph.htmlOutput(args.html)
def InfoShield_MDL(pads, output_path):
    """
    Greedily group sequences under templates while the total cost decreases.

    Starting from the first unassigned label, a POA graph is seeded with its
    sequence. Every other unassigned sequence whose alignment cost against the
    seed-only graph is lower than its stand-alone ``sequence_cost`` joins the
    candidate group and is merged into the working graph. If the group has
    more than one member, a template is derived from the graph
    (``dichotomous_search`` followed by ``slot_identify``) and accepted only
    when the recomputed total cost is lower than the current one. The
    candidate group is removed from the unassigned pool in every round,
    whether or not a template was accepted.

    :param pads: mapping from label to sequence (iterated with ``.items()``
        and indexed by label)
    :param output_path: forwarded unchanged to ``output_results``
    :return: ``(relative_saving, temp_dict, elapsed)`` where
        ``relative_saving`` is ``(init_cost - final_cost) / init_cost``,
        ``temp_dict`` maps the 1-based number of each accepted template to the
        array of labels assigned to it, and ``elapsed`` is the wall-clock time
        in seconds spent in the main loop
    """
    init_cost = prev_total_cost = np.sum(
        [sequence_cost(s) for _, s in pads.items()]) + len(pads)
    gid_arr = np.array([label for label, _ in pads.items()])
    start_time = time.time()
    temp_arr, cond_arr, temp_dict = [], [], {}

    while len(gid_arr) > 0:
        # Seed a graph with the first unassigned sequence; position 0 is
        # therefore always part of the candidate group.
        seed_label = gid_arr[0]
        graph, gid = poagraph.POAGraph(pads[seed_label], seed_label), [0]
        seq_total_cost = sequence_cost(pads[seed_label])
        # Frozen copy of the seed-only graph: membership is always decided
        # against the seed alone, not against the growing graph.
        graph_0 = copy.deepcopy(graph)

        # Positions start at 1 because the slice skips the seed at index 0.
        for position, label in enumerate(gid_arr[1:], start=1):
            sequence = pads[label]
            alignment = seqgraphalignment.SeqGraphAlignment(sequence, graph_0)
            align_mdl, _ = alignment.alignment_encoding_cost()
            seq_cost = sequence_cost(sequence)
            if align_mdl < seq_cost:
                gid.append(position)
                alignment = seqgraphalignment.SeqGraphAlignment(
                    sequence, graph)
                graph.incorporateSeqAlignment(alignment, sequence, label)
                seq_total_cost += seq_cost

        if len(gid) > 1:
            template, _ = dichotomous_search(pads, gid_arr, gid, graph)
            template = slot_identify(pads, gid_arr, gid, template)

            # Cost of encoding every group member against the final template.
            align_cost, c_arr = 0, []
            for position in gid:
                sequence = pads[gid_arr[position]]
                alignment = seqgraphalignment.SeqGraphAlignment(
                    sequence, template)
                cost, cond = alignment.alignment_encoding_cost()
                align_cost += cost
                c_arr.append(cond)

            # Replace the members' stand-alone cost and the bookkeeping terms
            # for the current template count with those for one more template.
            total_cost = prev_total_cost - seq_total_cost
            if len(temp_arr) != 0:
                total_cost -= log_star(len(temp_arr)) + len(gid_arr) * ceil(
                    np.log2(len(temp_arr)))
            total_cost += (len(gid_arr) + len(gid)) * ceil(
                np.log2(len(temp_arr) + 1))
            total_cost += (log_star(len(temp_arr) + 1)
                           + template.encoding_cost() + align_cost)

            # Check whether total cost decreases by this template
            if total_cost < prev_total_cost:
                prev_total_cost = total_cost
                temp_arr.append(template)
                cond_arr.append(c_arr)
                temp_dict[len(temp_arr)] = gid_arr[gid]

        # Delete the assigned sequences
        gid_arr = np.delete(gid_arr, gid)

    end_time = time.time()
    output_results(temp_arr, cond_arr, output_path)
    return ((init_cost - prev_total_cost) / init_cost, temp_dict,
            end_time - start_time)