def ReadGFA(self, f):
    """Populate this graph from the GFA file at path ``f`` and return ``self``.

    Only ``S`` (segment) and ``L`` (link) records are interpreted; every
    other record type and every blank line is ignored.

    Segment ends are modelled as ``(is_forward, segment_name, is_end)``
    tuples kept in a ``DisjointSet``. Each link merges the end of its first
    segment with the start of its second one, and also merges the
    corresponding pair on the opposite strand. Every resulting component
    becomes one vertex, numbered from 1 in the order ``listComponenets``
    yields them. Every segment then contributes two edges: ``name`` with
    its sequence and ``"-" + name`` with the reverse complement.
    """
    seqs = dict()
    v = DisjointSet()
    # Use a context manager so the handle is released even if parsing fails.
    with open(f, "r") as handle:
        for line in handle:
            s = line.split()
            if not s:
                # Blank line: nothing to parse (s[0] would raise IndexError).
                continue
            if s[0] == "S":
                seqs[s[1]] = s[2]
                # Register both ends of the segment on both strands.
                v.add((True, s[1], True))
                v.add((True, s[1], False))
                v.add((False, s[1], True))
                v.add((False, s[1], False))
            elif s[0] == "L":
                v1 = (s[2] == "+", s[1], True)
                v2 = (s[4] == "+", s[3], False)
                v.union(v1, v2)
                # The same junction seen from the opposite strand: flip
                # strand and end flags and swap the two sides.
                v1, v2 = ((not v2[0], v2[1], not v2[2]),
                          (not v1[0], v1[1], not v1[2]))
                v.union(v1, v2)
    ids = dict()
    for cnt, (vid, _members) in enumerate(v.listComponenets(), 1):
        self.AddVertex(str(cnt), str(vid))
        ids[vid] = str(cnt)
    for eid, seq in seqs.items():
        self.AddEdge(eid,
                     ids[v.get((True, eid, False))],
                     ids[v.get((True, eid, True))],
                     len(seq), eid, seq)
        self.AddEdge("-" + eid,
                     ids[v.get((False, eid, False))],
                     ids[v.get((False, eid, True))],
                     len(seq), "-" + eid, basic.RC(seq))
    return self
def __init__(self, seq, id, extension_handler, rc = None):
    # type: (str, str, ExtensionHandler, Optional[NewLine]) -> None
    """Create a line together with its reverse-complement twin.

    When ``rc`` is omitted this instance owns fresh storages, builds the
    twin itself (passing ``self`` as the twin's ``rc``) and registers a
    ``ReadAlignmentListener``. When ``rc`` is given, this instance is the
    twin: its storages and listeners are the ``.rc`` views of the ones
    held by ``rc``.
    """
    self.extensionHandler = extension_handler
    self.seq = seq
    self.id = id  # type: str
    self.circular = False
    self.name_printer = None
    self.max_extension = False
    if rc is not None:
        # Twin construction: mirror everything the primary line already has.
        self.initial = rc.initial.rc  # type: AlignmentStorage
        self.correct_segments = rc.correct_segments.rc  # type: SegmentStorage
        self.completely_resolved = rc.completely_resolved.rc  # type: SegmentStorage
        self.disjointig_alignments = rc.disjointig_alignments.rc  # type: AlignmentStorage
        self.read_alignments = rc.read_alignments.rc  # type: ReadAlignmentStorage
        self.listeners = [other.rc for other in rc.listeners]  # type: List[LineListener]
    else:
        self.initial = AlignmentStorage()
        self.correct_segments = SegmentStorage()
        self.completely_resolved = SegmentStorage()
        self.disjointig_alignments = AlignmentStorage()
        self.read_alignments = ReadAlignmentStorage()
        self.listeners = [
            self.initial,
            self.correct_segments,
            self.completely_resolved,
            self.disjointig_alignments,
            self.read_alignments,
            extension_handler,
        ]  # type: List[LineListener]
        # The storages and listeners above must exist before the twin is
        # built, because the twin's constructor reads them through ``self``.
        rc = NewLine(basic.RC(seq), basic.Reverse(self.id),
                     extension_handler.rc, self)  # type: NewLine
        self.rc = rc
        self.addListener(ReadAlignmentListener(self))
    Contig.__init__(self, seq, id, rc)
    self.rc = rc  # type: NewLine
    self.knot = None  # type: Knot
def uniqueNeighbours(edge_ids, graph, min_contig_length): unique = [] for eid in edge_ids: print "Finding neighbours of", eid for e in graph.v[graph.e[eid].start].inc: if basic.Normalize(e.id) in edge_ids: continue id = basic.Normalize(e.id) if len(e.seq) < min_contig_length + params.bad_end_length: seq = uniquePathForward( graph, e, min_contig_length + params.bad_end_length) id = id + "p" # seq = e.seq else: seq = e.seq[-min_contig_length - params.bad_end_length:] if e.id.startswith("-"): id = id + "l" else: id = id + "r" if e.id.startswith("-"): seq = basic.RC(seq) print "Right neighbour", eid, id unique.append((NamedSequence(seq, id), basic.Normalize(e.id))) for e in graph.v[graph.e[eid].end].out: if basic.Normalize(e.id) in edge_ids: continue id = basic.Normalize(e.id) if len(e.seq) < min_contig_length + params.bad_end_length: seq = uniquePathBackward( graph, e, min_contig_length + params.bad_end_length) id = id + "p" # seq = e.seq else: seq = e.seq[:min_contig_length + params.bad_end_length] if e.id.startswith("-"): id = id + "r" else: id = id + "l" if e.id.startswith("-"): seq = basic.RC(seq) print "Left neighbour", eid, id unique.append((NamedSequence(seq, id), basic.Normalize(e.id))) return unique
def __init__(self, line_left, line_right, gap, gap_seq = "", rc = None):
    # type: (NewLine, NewLine, int, str, Knot) -> None
    """Join ``line_left`` to ``line_right`` across ``gap``.

    A positive ``gap`` must come with a non-empty ``gap_seq``. When ``rc``
    is omitted, the reverse-complement knot is built here: it connects
    ``line_right.rc`` to ``line_left.rc`` with the same gap and the
    reverse-complemented gap sequence, and receives ``self`` as its ``rc``.
    """
    assert not (gap > 0 and len(gap_seq) == 0)
    self.line_left = line_left
    self.line_right = line_right
    self.gap = gap
    self.gap_seq = gap_seq
    if rc is not None:
        self.rc = rc
        return
    self.rc = Knot(line_right.rc, line_left.rc, gap, basic.RC(gap_seq), self)
def FillSeq(self, f, numeric=True):
    """Attach sequences from the FASTA file at path ``f`` to existing edges.

    For each record, the edge stored under the record id (if any) gets the
    sequence, and the edge stored under ``"-" + id`` (if any) gets its
    reverse complement; ``len`` is updated on both. With ``numeric`` set,
    the record id is first replaced by ``str(basic.parseNumber(id))``.
    Records that match no edge are ignored. Returns ``self``.
    """
    # Context manager so the FASTA handle is closed once parsing is done.
    with open(f, "r") as handle:
        for s in SeqIO.parse_fasta(handle):
            if numeric:
                s.id = str(basic.parseNumber(s.id))
            if s.id in self.e:
                self.e[s.id].seq = s.seq
                self.e[s.id].len = len(s.seq)
            rc_id = "-" + s.id
            if rc_id in self.e:
                self.e[rc_id].seq = basic.RC(s.seq)
                self.e[rc_id].len = len(s.seq)
    return self
def extendRight(self, seq, relevant_als=None):
    # type: (str, List[AlignmentPiece]) -> None
    """Append ``seq`` to the right end of this line.

    The line must not be attached to a knot. Listeners are notified before
    the change (with a temporary contig holding the extended sequence) and
    after it (with ``relevant_als``, or an empty list when it is omitted).
    The reverse-complement line is updated by prepending ``RC(seq)``.
    """
    sys.stdout.trace("Line operation Extend:", self, len(seq), relevant_als)
    assert self.knot is None
    als = [] if relevant_als is None else relevant_als
    preview = Contig(self.seq + seq, "TMP2_" + self.id)
    self.notifyBeforeExtendRight(preview, seq)
    self.seq += seq
    twin = self.rc
    twin.seq = basic.RC(seq) + twin.seq
    self.notifyAfterExtendRight(seq, als)
def FillSeq(self, f, numeric=True):
    """Attach sequences from the FASTA file at path ``f`` to existing edges.

    For each record, the edge stored under the record id (if any) gets the
    sequence, and the edge stored under ``basic.Reverse(id)`` (if any) gets
    its reverse complement; ``len`` is updated on both. With ``numeric``
    set, the record id is first replaced by ``str(basic.parseNumber(id))``.
    After loading, every edge is asserted to have a sequence.
    Returns ``self``.
    """
    # Context manager so the FASTA handle is closed once parsing is done.
    with open(f, "r") as handle:
        for s in SeqIO.parse_fasta(handle):
            if numeric:
                s.id = str(basic.parseNumber(s.id))
            if s.id in self.e:
                self.e[s.id].seq = s.seq
                self.e[s.id].len = len(s.seq)
            rc_id = basic.Reverse(s.id)
            if rc_id in self.e:
                self.e[rc_id].seq = basic.RC(s.seq)
                self.e[rc_id].len = len(s.seq)
    for edge in self.e.values():
        assert (edge.seq is not None)
    return self
def __init__(self, seq, id, rc=None):
    # type: (str, str, Optional[Disjointig]) -> None
    """Create a disjointig and, if needed, its reverse-complement twin.

    Without ``rc`` a fresh ``AlignmentStorage`` is created and the twin is
    built from the reverse-complemented sequence and reversed id. With
    ``rc`` this instance is the twin and takes the ``.rc`` view of the
    other instance's read alignments.
    """
    self.seq = seq
    self.id = id
    if rc is not None:
        self.rc = rc
        self.read_alignments = self.rc.read_alignments.rc  # type: AlignmentStorage
    else:
        # The storage must exist before the twin is constructed, since the
        # twin's constructor reads it through ``self``.
        self.read_alignments = AlignmentStorage()  # type: AlignmentStorage
        rc = Disjointig(basic.RC(seq), basic.Reverse(id), self)  # type: Disjointig
        self.rc = rc
    Contig.__init__(self, seq, id, rc)
    self.rc = rc  # type:Disjointig
def correctSequence(self, alignments):
    # type: (Iterable[AlignmentPiece]) -> None
    """Replace this line's sequence using the given correcting alignments.

    Alignments whose source and target segments spell the same sequence are
    dropped; the rest are trimmed with ``cutIdenticalEnds``. If nothing
    remains, the call is a no-op. Otherwise a ``Correction`` is built,
    listeners are notified before and after, and both this line's sequence
    and its reverse complement's are replaced.
    """
    sys.stdout.trace("Line operation Correct:", alignments)
    informative = []
    for piece in alignments:
        if piece.seg_from.Seq() != piece.seg_to.Seq():
            informative.append(piece.cutIdenticalEnds())
    if not informative:
        sys.stdout.trace("Skipping trivial correction operation")
        return
    correction = Correction.constructCorrection(informative)
    self.notifyBeforeCorrect(correction)
    # Snapshot of the pre-correction sequence, needed by changeQT below.
    previous = Contig(self.seq, "old")
    self.seq = correction.seq_from.seq
    self.rc.seq = basic.RC(self.seq)
    correction.changeQT(self, previous)
    self.notifyAfterCorrect(correction)
def main(ref_file, segment, dir):
    """Mask a segment of ``chr1`` with Ns and report how its reads realign.

    The reference is loaded from ``ref_file``. ``segment`` is a pair of
    coordinates on contig ``chr1``; if the first one is negative, both are
    negated and the reverse-complement contig is used instead. The segment
    is cut into 500-long reads, the segment itself is replaced by ``N``s in
    the reference, the reads are aligned back using ``dir`` as the working
    directory, and for each read one line ``"<alignment count> <best
    percent identity>"`` is written to standard output (identity is 0 when
    a read has no alignments).
    """
    # NOTE(review): assumes loadFromFasta reads the whole file before
    # returning, so the handle can be closed right away - confirm.
    with open(ref_file, "r") as ref_handle:
        ref = ContigCollection().loadFromFasta(ref_handle, False)
    chr1 = ref["chr1"]
    if segment[0] < 0:
        segment = (-segment[0], -segment[1])
        chr1 = chr1.rc
    reads = ReadCollection()
    reads_list = []
    for i in range(segment[0], segment[1], 500):
        read = reads.addNewRead(Segment(chr1, i, i + 500).asNamedSequence())
        reads_list.append(read)
    # Mask the segment only after the reads have been extracted from it.
    chr1.seq = chr1.seq[:segment[0]] + "N" * (segment[1] - segment[0]) + chr1.seq[segment[1]:]
    chr1.rc.seq = basic.RC(chr1.seq)
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, ref)
    out = sys.stdout
    for read in reads_list:
        # A list comprehension keeps the concatenation valid whether or not
        # map() returns a list.
        identities = [0] + [al.percentIdentity() for al in read.alignments]
        out.write(str(len(read.alignments)) + " " + str(max(identities)) + "\n")
    # Flush rather than close: ``out`` is the process-wide sys.stdout and
    # closing it would break any later output.
    out.flush()
def correctSequence(self, alignments):
    # type: (Iterable[AlignmentPiece]) -> None
    """Replace this line's sequence using the given correcting alignments.

    Alignments whose source and target segments spell the same sequence are
    trivial: they are logged and left out. If no alignment remains, the
    call is a no-op. Otherwise a ``Correction`` is built from the remaining
    alignments, listeners are notified before and after, and both this
    line's sequence and its reverse complement's are replaced.
    """
    sys.stdout.trace("Line operation Correct:", alignments)
    new_alignments = []
    for al in alignments:
        if al.seg_from.Seq() == al.seg_to.Seq():
            sys.stdout.trace("Skipping trivial correction alignment", al)
        else:
            new_alignments.append(al)
    if not new_alignments:
        sys.stdout.trace("Skipping trivial correction operation")
        return
    # Build the correction from the filtered list, so the alignments
    # reported above as skipped really are excluded.
    correction = Correction.constructCorrection(new_alignments)
    self.notifyBeforeCorrect(correction)
    # Snapshot of the pre-correction sequence, needed by changeQT below.
    old = Contig(self.seq, "old")
    self.seq = correction.seq_from.seq
    self.rc.seq = basic.RC(self.seq)
    correction.changeQT(self, old)
    self.notifyAfterCorrect(correction)
def main(flye_dir, rf, dir, edge_id, k): params.technology = "nano" params.k = k basic.ensure_dir_existance(dir) basic.CreateLog(dir) dd = DirDistributor(os.path.join(dir, "alignments")) print "Reading graph" graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa")) print "Parsing edge mapping" id_map = parseUPaths(flye_dir) edge_ids = edge_id.split(",") print "Extracting relevant graph component" res = open(os.path.join(dir, "contigs.fasta"), "w") unique = dict() for eid in edge_ids: for e in graph.v[graph.e[eid].start].inc: if basic.Normalize(e.id) in edge_ids: continue if len(e.seq) < 10000: if e.id.startswith("-"): unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:]) else: unique[e.id] = NamedSequence(e.seq, e.id) else: if e.id.startswith("-"): unique[e.id[1:] + "l"] = NamedSequence( basic.RC(e.seq[:5000]), e.id[1:] + "l") else: unique[e.id + "r"] = NamedSequence(e.seq[-5000:], e.id + "r") for e in graph.v[graph.e[eid].end].out: if basic.Normalize(e.id) in edge_ids: continue if len(e.seq) < 10000: if e.id.startswith("-"): unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:]) else: unique[e.id] = NamedSequence(e.seq, e.id) else: if e.id.startswith("-"): unique[e.id[1:] + "r"] = NamedSequence( basic.RC(e.seq[-5000:]), e.id[1:] + "r") else: unique[e.id + "l"] = NamedSequence(e.seq[:5000], e.id + "l") for c in unique.values(): print c.id SeqIO.write(c, res, "fasta") res.close() old_ids = [] for eid in edge_ids: for olde in id_map[eid[len("edge_"):]]: old_ids.append(basic.Normalize(olde)) print "Finding reads that align to", edge_ids print "Old ids:", old_ids relevant_read_ids = set() for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"), "r").readlines(): s = s.split() if s[0] != "Aln": continue if s[6].split("_")[1] in old_ids: relevant_read_ids.add(s[2][1:]) print s[2][1:], s[6].split("_")[1] print "Reading reads" res = open(os.path.join(dir, "reads.fasta"), "w") for read in SeqIO.parse_fasta(open(rf, "r")): if 
read.id in relevant_read_ids and len(read) > k * 1.2: SeqIO.write(read, res, "fasta") res.close()
def RC(self):
    # type: () -> NamedSequence
    """Return a new ``NamedSequence`` with the reverse-complemented
    sequence and the reversed id."""
    rc_seq = basic.RC(self.seq)
    rc_id = basic.Reverse(self.id)
    return NamedSequence(rc_seq, rc_id)
def RC(self):
    """Return a new ``Consensus`` with the reverse-complemented sequence
    and the coverage values in reversed order."""
    rc_seq = basic.RC(self.seq)
    reversed_cov = self.cov[::-1]
    return Consensus(rc_seq, reversed_cov)
def __init__(self, seq, id, rc=None):
    # type: (str, str, Optional[Contig]) -> None
    """Create a contig linked to its reverse complement.

    When ``rc`` is omitted, the reverse-complement contig is created here
    from ``basic.RC(seq)`` and ``basic.Reverse(id)`` and is given ``self``
    as its own ``rc``.
    """
    NamedSequence.__init__(self, seq, id)
    if rc is not None:
        self.rc = rc
        return
    self.rc = Contig(basic.RC(seq), basic.Reverse(id), self)
def main(args): flye_dir = sys.argv[1] repeats, starts, ends = parse(sys.argv[2]) graph_file, unique_file, disjointigs_file, rep_dir, tmp, their_file = cl_params.parseFlyeDir( flye_dir) dump = os.path.join(rep_dir, "read_alignment_dump") reads_file = sys.argv[3] dir = sys.argv[4] CreateLog(dir) print " ".join(args) print "Printing contigs" edges_file = os.path.join(rep_dir, "graph_before_rr.fasta") edges = ContigStorage().loadFromFasta(open(edges_file, "r")) unique = open(os.path.join(dir, "contigs"), "w") for l in starts: seq = "".join(map(lambda eid: edges[eid].seq, l)) if len(seq) > 15000: seq = seq[-15000:] SeqIO.write(NamedSequence(seq, "(" + "_".join(l) + ")"), unique, "fasta") for l in ends: seq = "".join(map(lambda eid: edges[eid].seq, l)) if len(seq) > 15000: seq = seq[:15000] SeqIO.write(NamedSequence(basic.RC(seq), "(" + "_".join(l) + ")"), unique, "fasta") unique.close() print "Selecting reads" reads = set() cur_read = None als = [] for s in open(dump).readlines(): if s.startswith("Chain"): if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e": print als for al in als: if al in repeats: if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e": print "oppa" reads.add(cur_read) break als = [] else: s = s.split() cur_read = s[2][1:] eid = s[6].split("_")[1] if s[6][0] == "-": eid = "-" + eid if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e": print eid als.append(eid) print "Selected", len(reads), "reads" print "\n".join(reads) print "Reading and printing reads" freads = open(os.path.join(dir, "reads.fasta"), "w") cnt = 0 for read in SeqIO.parse_by_name(reads_file): cnt += 1 if cnt % 10000 == 0: print cnt if read.id in reads: SeqIO.write(read, freads, "fasta") freads.close()
def RC(self):
    # type: () -> SeqRecord
    """Return a new ``SeqRecord`` with the reverse-complemented sequence,
    the reversed id and, when present, the quality values in reversed
    order."""
    quality = self.qual
    reversed_quality = None if quality is None else quality[::-1]
    return SeqRecord(basic.RC(self.seq), basic.Reverse(self.id), reversed_quality)