def collectRecords(self, corrected):
    # type: (List[Segment]) -> List[LineExtender.Record]
    sys.stdout.trace("Collecting records", corrected)
    read_bounds = dict()
    records = dict()  # type: Dict[Segment, LineExtender.Record]
    good_reads = set()
    for seg in corrected:
        sys.stdout.trace("Oppa initial:", seg)
        seg = seg.expandLeft(params.k)
        sys.stdout.trace("Alignments relevant for", seg, list(self.dot_plot.allInter(seg)))
        for al in self.dot_plot.allInter(seg):
            seg1 = al.matchingSequence().mapSegUp(al.seg_from.contig, seg)
            line = al.seg_from.contig  # type: NewLine
            for seg_correct in line.correct_segments.allInter(al.seg_from):
                for seg_resolved in line.completely_resolved.allInter(seg_correct):
                    if seg_resolved in records:
                        continue
                    if seg_resolved.right == len(line):
                        next_start = len(line)
                    else:
                        next_seg = line.completely_resolved.find(line.asSegment().suffix(pos=seg_resolved.right), 1)
                        if next_seg is None:
                            next_start = len(line)
                        else:
                            next_start = next_seg.left
                    next_start = min(next_start, len(line) - 200)
                    focus = line.segment(
                        max(seg_resolved.left, min(seg_resolved.right - params.k, seg1.left)),
                        min(seg_correct.right, next_start + params.k))
                    als = list(line.getRelevantAlignmentsFor(focus))
                    reads = ContigStorage()
                    for al1 in als:
                        reads.add(al1.seg_from.contig)
                    als = list(self.aligner.localAlign(reads.unique(), ContigStorage([line])))
                    final_als = []
                    sys.stdout.trace("Focus:", focus, seg_resolved)
                    sys.stdout.trace(als)
                    for al1 in als:
                        if al1.seg_to.contig == line.rc:
                            al1 = al1.rc
                        if al1.seg_to.interSize(focus) >= params.k - 100:
                            final_als.append(al1)
                    sys.stdout.trace(final_als)
                    sys.stdout.trace("Finished realignment of reads")
                    records[seg_resolved] = self.createRecord(seg_resolved, next_start, seg_correct, final_als,
                                                              good_reads, read_bounds)
    records = list(records.values())  # type: List[LineExtender.Record]
    return records

def alsToReads(als):
    # type: (List[AlignmentPiece]) -> ContigStorage
    readIds = set()
    res = ContigStorage()
    for al in als:
        if al.seg_from.contig.id in readIds:
            continue
        readIds.add(al.seg_from.contig.id)
        res.add(al.seg_from.contig)
    return res

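# A minimal, self-contained sketch of the deduplication pattern used by
# alsToReads, with a plain Python list standing in for ContigStorage
# (illustrative only, not part of this module): keep the first read seen
# for each id.
def dedup_by_id(reads):
    seen = set()
    res = []
    for read in reads:
        if read.id in seen:
            continue
        seen.add(read.id)
        res.append(read)
    return res
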
def ExtendShortContigs(contigs, reads, aligner, polisher, read_dump):
    # type: (ContigStorage, ReadCollection, Aligner, Polisher, str) -> None
    sys.stdout.info("Extending short lines")
    short_contigs = ContigStorage()
    als = dict()  # type: Dict[str, List[AlignmentPiece]]
    for contig in contigs.unique():
        if len(contig) < params.k + 500:
            short_contigs.add(contig)
            als[contig.id] = []
            als[contig.rc.id] = []
    if read_dump is not None:
        sys.stdout.trace("Using flye read dump file to extend short contigs")
        relevant_reads = RelevantReadsFromDump(read_dump, short_contigs, reads)
        for contig in short_contigs:
            for al in aligner.overlapAlign(relevant_reads[contig.id], ContigStorage([contig])):
                als[al.seg_to.contig.id].append(al)
                als[al.seg_to.contig.rc.id].append(al.rc)
    else:
        sys.stdout.trace("Realigning all reads to extend short contigs")
        for al in aligner.overlapAlign(reads, short_contigs):
            if al.seg_to.left <= 20 and al.rc.seg_to.left <= 20:
                added = False
                for i, al1 in enumerate(als[al.seg_to.contig.id]):
                    if al1.seg_from.contig.id == al.seg_from.contig.id:
                        added = True
                        if al.percentIdentity() > al1.percentIdentity():
                            als[al.seg_to.contig.id][i] = al
                            als[al.seg_to.contig.rc.id][i] = al.rc
                        break
                if not added:
                    als[al.seg_to.contig.id].append(al)
                    als[al.seg_to.contig.rc.id].append(al.rc)
    for contig in short_contigs.unique():
        if len(als[contig.id]) > 0:
            # Extend the right end first (gain r), then the left end via the reverse complement (gain l).
            tmp_contig, new_als = polisher.polishEnd(als[contig.id], params.reliable_coverage,
                                                     max_extension=params.l - len(contig))
            r = len(tmp_contig) - len(contig)
            tmp_contig, new_als = polisher.polishEnd([al.rc for al in new_als], params.reliable_coverage,
                                                     max_extension=params.l - len(contig))
            l = len(tmp_contig) - len(contig) - r
        else:
            tmp_contig, new_als = contig, als[contig.id]
            l = 0
            r = 0
#        if l > params.k / 2 and r > params.k / 2:
#            tmp_contig.seq = tmp_contig.seq[l - params.k / 2:-r + params.k / 2]
#        else:
#            tmp_contig.seq = tmp_contig.seq[max(0, l - params.k):-max(1, r - params.k)]
        if len(tmp_contig) > params.k + 500:
            sys.stdout.info("Prolonged contig", contig.id, "for", l, "and", r, "nucleotides from left and right")
            contigs.add(Contig(tmp_contig.rc.seq, contig.id))
        else:
            sys.stdout.warn("Could not prolong contig", contig.id, "enough. Removing it.")
            contigs.remove(contig)

def getRelevantAlignments(self, seg, min_overlap):
    # type: (Segment, int) -> Generator[AlignmentPiece]
    sys.stdout.trace("Requesting read alignments for", seg, "using palignments")
    line = seg.contig  # type: NewLine
    reads = ContigStorage()
    relevant_reads = line.read_alignments.allInter(seg, min_overlap)
    sys.stdout.trace("Using reads", relevant_reads)
    for base_read_al in relevant_reads:
        for read in self.als.getAlignments(base_read_al.seg_from.contig.id, params.k):
            reads.add(read)
    cnt = 0
    for al in self.aligner.localAlign(reads, ContigStorage([seg.contig])):
        if al.seg_to.interSize(seg) > min_overlap and len(al) > params.k:
            yield al
            cnt += 1
    sys.stdout.trace("Request for read alignments for", seg, "yielded", cnt, "alignments")

def draw(contigs_file, output_dir, k):
    aligner = Aligner(DirDistributor(os.path.join(output_dir, "alignments")))
    CreateLog(output_dir)
    print "Reading contigs"
    tmp = sorted(SeqIO.parse_fasta(open(contigs_file, "r")), key=lambda contig: len(contig))
    lens = map(len, tmp)[::-1]
    print lens
    contigs = ContigStorage()
    if lens[1::2] == lens[0::2]:
        # Lengths pair up, so the contigs likely come in forward/reverse-complement pairs; keep every other one.
        tmp = tmp[0::2]
        print "Removed extra contigs"
    for i, contig in enumerate(tmp):
        print i, contig
        contigs.add(Contig(contig.seq, str(i)))
    print "Constructing components"
    components = ExtractRepeatComponents(contigs, aligner, k)
    print "Components:"
    for comp in components:
        print comp.segments
        print comp.alignments
    for cnt, comp in enumerate(components):
        print "Processing component", cnt
        print comp.segments
#        print comp.alignments
        print "Forming blocks"
        Block.id_cnt = 0
        blocks = CreateBlocks(comp)
        if len(blocks) == 1:
            print "Skipping trivial repeat"
            continue
        for block in blocks:
            print "Block", block.id, ":", block.segs
        for block in blocks:
            for other in block.out:
                print block.id, "->", other.id
        print "Placing blocks on X axis"
        code = placeX(blocks)
        if code == 1:
            print "WARNING: component", cnt, "contains cycle. Aborting visualization."
            continue
        print "Placing blocks on Y axis"
        placeY(blocks, comp.segments)
        print "Printing figure"
        SimplePrinter().printBlocks(blocks, sys.stdout)
        print "Finished printing figure"

def recruit(seqs, reads, k, dir):
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    relevant_reads = ContigStorage()
    disjointigs = seqs
    for i in range(2):
        sys.stdout.info("Recruiting iteration", i)
        als = filter(lambda al: len(al) > k, aligner.localAlign(reads, disjointigs))
        print len(als), "alignments"
        relevant_reads = alsToReads(als)
        l = sum(map(len, seqs.unique()))
        disjointigs = constructDisjointigs(relevant_reads, l, dd.nextDir())
        print len(disjointigs), "disjointigs"
        print disjointigs
    disjointigs.writeToFasta(open(os.path.join(dir, "disjointigs.fasta"), "w"))
    relevant_reads.writeToFasta(open(os.path.join(dir, "reads.fasta"), "w"))
    sys.stdout.info("Aligning repeat sequences to disjointigs")
    als = list(aligner.localAlign(seqs, disjointigs))
    print "\n".join(map(str, als))
    starts = dict()
    for dis in disjointigs:
        starts[dis.id] = len(dis)
    for al in als:
        if len(al) > k:
            starts[al.seg_to.contig.id] = min(starts[al.seg_to.contig.id], al.seg_to.left)
            al = al.rc
            starts[al.seg_to.contig.id] = min(starts[al.seg_to.contig.id], al.seg_to.left)
    print "Starts:"
    for cid, val in starts.items():
        print cid, val
    contigs = ContigStorage()
    cnt = 1
    for dis in disjointigs:
        if k < starts[dis.id] < len(dis):
            print cnt, dis.id, starts[dis.id]
            contigs.add(Contig(dis.prefix(starts[dis.id]).Seq(), str(cnt)))
            cnt += 1
    for dis in disjointigs.unique():
        if len(dis) > k and starts[dis.id] == len(dis):
            print cnt, dis.id
            contigs.add(Contig(dis.seq, str(cnt)))
            cnt += 1
    contigs.writeToFasta(open(os.path.join(dir, "contigs.fasta"), "w"))
    fakeGraph(contigs, open(os.path.join(dir, "graph.gv"), "w"))

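# Self-contained sketch of the prefix-trimming step in recruit: for each
# disjointig we track the leftmost target coordinate touched by a long
# repeat alignment (with both strands folded in, as above) and keep only
# the repeat-free prefix. Names here are illustrative, not part of this module.
def unique_prefix_end(dis_len, repeat_align_lefts):
    # repeat_align_lefts: leftmost target coordinates of alignments longer than k
    start = dis_len
    for left in repeat_align_lefts:
        start = min(start, left)
    # The prefix [0, start) contains no repeat alignment.
    return start

# Example: a 50kb disjointig with repeat hits starting at 31000 and 44000
# keeps its first 31000 bases as a unique contig.
# print unique_prefix_end(50000, [31000, 44000])  # -> 31000
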
def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
#        if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
#        else:
#            contigs.add(Contig(c1.seq, str(cnt)))
#            print cnt, c1.id
#            cnt += 1
#            contigs.add(Contig(c2.seq, str(cnt)))
#            print cnt, c2.id
#            cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type: AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()

def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))
        tmp_cov = []
        # Estimate average coverage from the longest edges that together cover half of the total length.
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):  # type: AlignmentPiece
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)
    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
#    contigs.loadFromFasta(open(contigs_file, "r"), num_names=True)
#    contigs = contigs.filter(lambda contig: contig.id not in nonunique and len(contig) > params.k + 20)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs

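# Self-contained sketch of the coverage heuristic in CreateContigCollection:
# walk the edges from longest to shortest until half of the total length is
# consumed, then average coverage over that prefix, weighted by length.
# Names here are illustrative, not part of this module.
def estimate_avg_coverage(edges):
    # edges: list of (length, coverage) pairs
    total = sum(l for l, c in edges) / 2
    tmp = []
    for l, c in sorted(edges, reverse=True):
        if total < 0:
            break
        tmp.append((l, c))
        total -= l
    return float(sum(l * c for l, c in tmp)) / sum(l for l, c in tmp)

# Example: the two longest edges dominate the estimate and the short
# high-coverage (likely repetitive) edge is ignored.
# print estimate_avg_coverage([(40000, 30), (35000, 32), (5000, 90)])  # ~30.9
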
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
#    contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    cnt = 0
    for read in reads.unique():
        cnt += 1
#        if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for i in range(3):
        diff = 0
        print "Iteration", i
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key=lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key=lambda al: al.seg_from.contig.id)
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        dp = scorer.accurateScore(al.matchingSequence(), 10)  # 1 - al.percentIdentity()
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10)  # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10)  # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        diffs = sorted(diffs)
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
#            if pi1 > pi2 + dp / 4:
#                reads1.add(al1.seg_from.contig)
#            elif pi2 > pi1 + dp / 4:
#                reads2.add(al2.seg_from.contig)
#            diff += abs(pi1 - pi2)
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
    al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
    print al
    print "\n".join(al.asMatchingStrings2())
    for read in reads1:
        if read.id in initial_reads1:
            sys.stdout.write(read.id + " ")
    print ""
    for read in reads2:
        if read.id in initial_reads2:
            sys.stdout.write(read.id + " ")
    print ""
    contig1 = prolong(aligner, polisher, contig1, reads1)
    contig2 = prolong(aligner, polisher, contig2, reads2)
    contig1.id = "1"
    contig2.id = "2"
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"

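# Self-contained sketch of the quartile thresholding used in the iteration
# above: reads whose identity difference falls below the first quartile go
# to copy 1, those above the third quartile go to copy 2, and the ambiguous
# middle half stays unassigned. Names here are illustrative only.
def quartile_thresholds(diffs):
    s = sorted(diffs)
    return s[len(s) / 4], s[len(s) * 3 / 4]

# Example (Python 2 integer division, as in the surrounding code):
# quartile_thresholds([-0.03, -0.01, 0.0, 0.02, 0.04, 0.05, 0.07, 0.1])
# -> (0.0, 0.07)
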
class TestDataset:
    def __init__(self, genome="", letter_size=550, error_rate=0.05, mutation_rate=0.005, seed=0):
        random.seed(seed)
        self.reads = []  # type: List[NamedSequence]
        self.disjointigs = []  # type: List[NamedSequence]
        self.contigs = []  # type: List[NamedSequence]
        self.letter_size = letter_size
        self.error_rate = error_rate
        self.mutation_rate = mutation_rate
        self.alphabet = ContigStorage()
        self.matches = dict()
        for c1, c2 in zip(ascii_lowercase, ascii_uppercase):
            seq = self.generate(self.letter_size)
            self.alphabet.add(Contig(seq, c1))
            seq, matches = self.mutate(seq, self.mutation_rate)
            self.alphabet.add(Contig(seq, c2))
            self.matches[c1] = matches
            self.matches[c2] = [(b, a) for a, b in matches]
        self.genome = Contig(self.translate(genome), genome)

    def translate(self, seq):
        return "".join(map(lambda c: self.alphabet[c].seq, seq))

    def addRead(self, read_seq):
        name = "R" + str(len(self.reads)) + "_" + read_seq
        self.reads.append(NamedSequence(self.mutate(self.translate(read_seq), self.error_rate)[0], name))
        return name

    def addDisjointig(self, disjointig_seq):
        # type: (str) -> str
        self.disjointigs.append(NamedSequence(self.mutate(self.translate(disjointig_seq), self.mutation_rate)[0],
                                              "D" + str(len(self.disjointigs)) + "_" + disjointig_seq))
        return self.disjointigs[-1].id

    def addContig(self, contig_seq):
        # type: (str) -> str
        name = "C" + str(len(self.contigs)) + "_" + contig_seq
        self.contigs.append(NamedSequence(self.translate(contig_seq), name))
        return name

    def generateReads(self, length=5, cov=15, circular=False):
        genome = self.genome.id  # the genome id stores the original letter string
        if circular:
            genome = genome + genome[0:length - 1]
        for i in range(0, len(genome) - length + 1):
            for j in range((cov + length - 1) / length):
                self.addRead(genome[i:i + length])

    def generate(self, letter_size):
        # type: (int) -> str
        return "".join([random.choice(["A", "C", "G", "T"]) for i in range(letter_size)])

    def genAll(self, aligner):
        # type: (Aligner) -> Tuple[NewLineStorage, LineDotPlot, ReadCollection]
        disjointigs = DisjointigCollection()
        for dis in self.disjointigs:
            disjointigs.addNew(dis.seq, dis.id)
        from disjointig_resolve.line_storage import NewLineStorage
        lines = NewLineStorage(disjointigs, aligner)
        lines.name_printer = lambda line: line.id + "_" + self.translateBack(line, aligner)
        for line in self.contigs:
            new_line = lines.addNew(line.seq, line.id)
            new_line.initial.add(AlignmentPiece.Identical(new_line.asSegment().asContig().asSegment(),
                                                          new_line.asSegment()))
        dp = LineDotPlot(lines, aligner)
        dp.construct(aligner)
        lines.alignDisjointigs()
        reads = ReadCollection()
        for read in self.reads:
            reads.addNewRead(read)
        disjointigs.addAlignments(aligner.localAlign(reads, disjointigs))
        return lines, dp, reads

    def mutate(self, seq, rate):
        # type: (str, float) -> Tuple[str, List[Tuple[int, int]]]
        res = [seq[0]]
        matches = []
        matches.append((0, 0))
        cur = 1
        for i, c in enumerate(seq):
            if i == 0 or i == len(seq) - 1:
                continue
            if random.random() < rate:
                vars = ["A", "C", "G", "T"]
                vars.remove(c)
                res.append(random.choice([random.choice(vars), "", c + c]))
                cur += len(res[-1])
            else:
                res.append(c)
                matches.append((cur, i))
                cur += 1
        res.append(seq[-1])
        # Keep the (mutated position, original position) order used for all other match pairs.
        matches.append((cur, len(seq) - 1))
        return "".join(res), matches

    def saveStructure(self, handler):
        # type: (TokenWriter) -> None
        handler.writeToken(self.genome.id)
        handler.writeInt(len(self.reads))
        for read in self.reads:
            handler.writeToken(read.id.split("_")[-1])
        handler.writeInt(len(self.disjointigs))
        for disjointig in self.disjointigs:
            handler.writeToken(disjointig.id.split("_")[-1])
        handler.writeInt(len(self.contigs))
        for contig in self.contigs:
            handler.writeToken(contig.id.split("_")[-1])

    @staticmethod
    def loadStructure(handler):
        # type: (TokenReader) -> TestDataset
        random.seed(0)
        res = TestDataset(handler.readToken())
        for i in range(handler.readInt()):
            res.addRead(handler.readToken())
        for i in range(handler.readInt()):
            res.addDisjointig(handler.readToken())
        for i in range(handler.readInt()):
            res.addContig(handler.readToken())
        return res

    def translateBack(self, contig, aligner):
        # type: (Contig, Aligner) -> str
        res = []
        for al in sorted(aligner.overlapAlign([contig], self.alphabet), key=lambda al: al.seg_from.left):
            if len(res) > 0 and al.seg_from.interSize(res[-1].seg_from) > self.letter_size / 2:
                if al.percentIdentity() > res[-1].percentIdentity():
                    res[-1] = al
            else:
                res.append(al)
        return "".join([al.seg_to.contig.id for al in res])

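# Self-contained sketch of the error model behind TestDataset.mutate: each
# interior base is, with probability rate, replaced by a random substitution,
# a deletion, or a duplication, while the first and last bases are kept so
# alignments stay anchored. The match bookkeeping is omitted; names here are
# illustrative only.
import random

def mutate_simple(seq, rate):
    out = [seq[0]]
    for c in seq[1:-1]:
        if random.random() < rate:
            choices = [b for b in "ACGT" if b != c]
            out.append(random.choice([random.choice(choices), "", c + c]))
        else:
            out.append(c)
    out.append(seq[-1])
    return "".join(out)

# print mutate_simple("ACGTACGTACGT", 0.1)
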
def splitSeg(aligner, seg, mult, all_reads_list):
    all_reads = ContigStorage()
    base = seg.asContig()
    tmp = []
    for al in fixAlDir(aligner.overlapAlign(all_reads_list, ContigStorage([base])), base):
        if len(al.seg_to) < len(base) - 100:
            continue
        all_reads.add(al.seg_from.contig)
        tmp.append(al.seg_from.contig)
    all_reads_list = tmp
    split_reads = []
    split_contigs = []
    for i in range(mult):
        split_reads.append([])
        split_contigs.append(base)
    # Deal the reads round-robin into the initial partitions.
    cnt = 0
    for read in all_reads_list:
        split_reads[cnt % mult].append(read)
        cnt += 1
    polisher = Polisher(aligner, aligner.dir_distributor)
    for i in range(10):
        print "Iteration", i
        split_contigs = []
        for reads in split_reads:
            tmp_als = fixAlDir(aligner.overlapAlign(reads, ContigStorage([base])), base)
            split_contigs.append(Contig(polisher.polishSmallSegment(base.asSegment(), tmp_als).seg_from.Seq(),
                                        str(len(split_contigs))))
        bestals = dict()
        for read in all_reads_list:
            bestals[read.id] = None
        for contig in split_contigs:
            for al in fixAlDir(aligner.overlapAlign(all_reads_list, ContigStorage([contig])), contig):
                if len(al.seg_to) < len(base) - 100:
                    continue
                if al.seg_from.contig.id not in bestals:
                    print bestals.keys()
                    print al
                if bestals[al.seg_from.contig.id] is None or al.percentIdentity() > bestals[al.seg_from.contig.id].percentIdentity():
                    bestals[al.seg_from.contig.id] = al
#        als.append(fixAlDir(aligner.overlapAlign(all_reads_list, ContigStorage([contig])), contig))
#        als[-1] = sorted(als[-1], key = lambda al: al.seg_from.contig.id)
        for idx in range(mult):
            split_reads[idx] = []
        for rid in bestals:
            al = bestals[rid]
            if al is None:
                print "Warning: no alignment for read", rid
            else:
                split_reads[int(al.seg_to.contig.id)].append(al.seg_from.contig)
        print " ".join(map(str, map(len, split_reads)))
    maxpi = 0
    print "pi matrix:"
    for i in range(mult):
        for j in range(mult):
            al = aligner.overlapAlign([split_contigs[i]], ContigStorage([split_contigs[j]])).next()
            sys.stdout.write(str(al.percentIdentity()) + " ")
            maxpi = max(maxpi, al.percentIdentity())
        print ""
    print "Maxpi:", maxpi
    if maxpi < 0.985:
        return zip(split_contigs, split_reads)
    else:
        return None

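# Self-contained sketch of the read-reassignment step inside splitSeg: each
# read goes to the candidate copy against which it aligns with the highest
# identity. Input is a list of (read_id, copy_index, identity) triples;
# names here are illustrative, not part of this module.
def partition_reads(triples, mult):
    best = dict()
    for rid, copy, pi in triples:
        if rid not in best or pi > best[rid][1]:
            best[rid] = (copy, pi)
    groups = [[] for _ in range(mult)]
    for rid, (copy, pi) in best.items():
        groups[copy].append(rid)
    return groups

# Example: read "r1" aligns to copy 0 at 0.93 and copy 1 at 0.97 -> copy 1.
# print partition_reads([("r1", 0, 0.93), ("r1", 1, 0.97)], 2)  # [[], ['r1']]
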
def main(flye_dir, rf, dir, edge_id, to_resolve, min_contig_length):
    params.technology = "nano"
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    print " ".join(sys.argv)
    print "Reading graph"
    graph = SimpleGraph().ReadDot(os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv"))
    graph.FillSeq(os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta"), True)
    print "Extracting relevant graph component"
    edge_ids = edge_id.split(",")
    to_resolve = to_resolve.split(",")
    to_resolve = [(a, int(b)) for a, b in zip(to_resolve[0::2], to_resolve[1::2])]
    unique = uniqueNeighbours(edge_ids, graph, min_contig_length)
    if rf == "none":
        return
    print "Finding reads that align to", edge_ids
    reads_to_resolve = dict()  # type: Dict[str, List[str]]
    for eid, mult in to_resolve:
        reads_to_resolve[eid] = []
    for unique_edge, initial in unique:
        reads_to_resolve[initial] = []
    relevant_read_ids = set()
    for rid, eid in parseReadDump(os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if eid in edge_ids:
            relevant_read_ids.add(rid)
            print rid, eid
    for rid, eid in parseReadDump(os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if rid in relevant_read_ids and eid in reads_to_resolve:
            reads_to_resolve[eid].append(rid)
    for eid in reads_to_resolve:
        reads_to_resolve[eid] = list(set(reads_to_resolve[eid]))
    print "Reading reads"
    res_reads = ContigStorage()
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_by_name(rf):
        if read.id in relevant_read_ids:
            res_reads.add(Contig(read.seq, read.id))
            SeqIO.write(read, res, "fasta")
    res.close()
    random_down = open(os.path.join(dir, "random_down.fasta"), "w")
    cnt = 0
    for read in res_reads:
        if cnt % 5 == 0:
            SeqIO.write(read, random_down, "fasta")
        cnt += 1
    random_down.close()
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    lcf = open(os.path.join(dir, "contigs.lc"), "w")
    for eid, mult in to_resolve:
        repeat_reads = [res_reads[rid] for rid in reads_to_resolve[eid]]
        print reads_to_resolve[eid]
        print map(str, repeat_reads)
        split_contigs = splitRepeat(aligner, graph.e[eid].seq, mult, repeat_reads, min_contig_length)
        if split_contigs is None:
            print "Failed to resolve edge", eid, "Aborting"
            continue
        print "Edge", eid, "was split into", mult, "copies"
        for contig, contig_reads in split_contigs:
            print contig.id
            SeqIO.write(contig, res, "fasta")
            lcf.write(contig.id + "\n")
            lcf.write(" ".join([r.id for r in contig_reads]) + "\n")
    # Keep writing to the same contigs.fasta handle; reopening it here would truncate the split contigs.
    for unique_edge, initial in unique:
        print unique_edge.id
        SeqIO.write(unique_edge, res, "fasta")
        lcf.write(unique_edge.id + "\n")
        lcf.write(" ".join(reads_to_resolve[initial]) + "\n")
    res.close()
    lcf.close()

def main(contigs_file, contig_name, reads_file, dir, k):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    contig = contigs[contig_name]
    contigs = ContigStorage()
    contigs.add(contig)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    als = list(aligner.localAlign(reads.unique(), contigs))
    tmp = []
    for al in als:
        if al.seg_to.contig != contig:
            al = al.rc
        tmp.append(al)
    als = tmp
    als = sorted(als, key=lambda al: al.seg_to.left / 50 * 1000000 + al.seg_to.right - al.seg_to.left)
    counts = dict()
    for al in als:
        counts[al.seg_from.contig.id] = 0
    for al in als:
        if len(al) > k:
            counts[al.seg_from.contig.id] += 1
    w = 20
    f = open(os.path.join(dir, "reads.fasta"), "w")
    over = set()
    inter = set()
    for al in als:
        if len(al) < k:
            continue
        inter.add(basic.Normalize(al.seg_from.contig.id))
        if not al.contradictingRTC():
            over.add(basic.Normalize(al.seg_from.contig.id))
        m = al.matchingSequence(True)
        tmp = []
        for i in range(len(contig) / w + 1):
            tmp.append([])
        for a, b in m.matches:
            tmp[b / w].append((a, b))
        for i in range(len(contig) / w):
            if i + 1 < len(tmp) and len(tmp[i + 1]) > 0:
                tmp[i].append(tmp[i + 1][0])
        # Print one symbol per w-nucleotide window: B/E for clipped read ends, I/i and D/d for
        # long/short insertions and deletions, digits for the number of unmatched positions.
        for i in range(len(contig) / w):
            seg = contig.segment(i * w, i * w + w)
            if al.seg_to.inter(seg):
                if al.seg_to.left >= seg.left and al.seg_from.left > params.bad_end_length:
                    sys.stdout.write("B")
                elif al.seg_to.right <= seg.right and al.rc.seg_from.left > params.bad_end_length:
                    sys.stdout.write("E")
                else:
                    if len(tmp[i]) == 0:
                        sys.stdout.write("*")
                    else:
                        a = tmp[i][-1][0] - tmp[i][0][0]
                        b = tmp[i][-1][1] - tmp[i][0][1]
                        if a - b > 30:
                            sys.stdout.write("I")
                        elif a - b > 15:
                            sys.stdout.write("i")
                        elif a - b < -30:
                            sys.stdout.write("D")
                        elif a - b < -15:
                            sys.stdout.write("d")
                        else:
                            sys.stdout.write(str(min(8, max(a, b) + 1 - len(tmp[i]))))
            else:
                sys.stdout.write("*")
        print " ", al.seg_from.contig.id, counts[al.seg_from.contig.id], al.contradictingRTC()
    print inter
    for rid in inter:
        SeqIO.write(reads[rid], f, "fasta")
        print rid, reads[rid]
    f.close()
    f = open(os.path.join(dir, "reads_over.fasta"), "w")
    for rid in over:
        SeqIO.write(reads[rid], f, "fasta")
    f.close()

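# Self-contained sketch of the per-window call printed above: given the first
# and last match pair inside a window, the difference between the spanned read
# length and contig length classifies the window as a long or short
# insertion/deletion; otherwise a digit reflects how many positions failed to
# match. Names here are illustrative only.
def window_symbol(first, last, n_matches):
    a = last[0] - first[0]  # span on the read
    b = last[1] - first[1]  # span on the contig
    if a - b > 30:
        return "I"
    elif a - b > 15:
        return "i"
    elif a - b < -30:
        return "D"
    elif a - b < -15:
        return "d"
    return str(min(8, max(a, b) + 1 - n_matches))

# Example: 21 read positions span 20 contig positions with 19 matches -> "3".
# print window_symbol((100, 200), (121, 220), 19)
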
def main(model_file, k, dir, contigs_file, reads_file):
    # type: (str, int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.scores = ComplexScores()
    params.scores.load(open(model_file, "r"))
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
#        if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
#        else:
#            contigs.add(Contig(c1.seq, str(cnt)))
#            print cnt, c1.id
#            cnt += 1
#            contigs.add(Contig(c2.seq, str(cnt)))
#            print cnt, c2.id
#            cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type: AlignedRead
            read.addAlignment(al)
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        cnt = 0
        al0 = None
        others = []
        for al in read.alignments:
            if not al.contradictingRTC():
                cnt += 1
                al0 = al
            else:
                others.append(al)
        if cnt != 1 or len(others) == 0:
            continue
        print al0
        print others
        seg = al0.seg_from
        for al in others:
            if al.seg_from.interSize(seg) < k:
                seg = None
                break
            else:
                seg = al.seg_from.cap(seg)
        print seg
        if seg is None:
            continue
        al0 = al0.reduce(query=seg)
        others = [al.reduce(query=seg) for al in others]
        scorer = Scorer(params.scores)
        for al in others:
            a, b, c = scorer.scoreCommon(al0, al)
            print "win", a, b, c, len(seg)
        if len(seg) > 1000:
            for i in range(len(seg) / 1000):
                seg1 = seg.prefix(length=i * 1000 + 1000).suffix(length=1000)
                for al in others:
                    a, b, c = scorer.scoreCommon(al0.reduce(query=seg1), al.reduce(query=seg1))
                    print "win1000", a, b, c, len(seg1)
        for al1 in others:
            for al2 in others:
                if al1 == al2:
                    continue
                a, b, c = scorer.scoreCommon(al1, al2)
                print "draw", a, b, c, len(seg)