Example #1
 def collectRecords(self, corrected):
     # type: (List[Segment]) -> List[LineExtender.Record]
     sys.stdout.trace("Collecting records", corrected)
     read_bounds = dict()
     records = dict()  # type: Dict[Segment, LineExtender.Record]
     good_reads = set()
     for seg in corrected:
         sys.stdout.trace("Oppa initial:", seg)
         seg = seg.expandLeft(params.k)
         sys.stdout.trace("Alignments relevant for", seg,
                          list(self.dot_plot.allInter(seg)))
         for al in self.dot_plot.allInter(seg):
             seg1 = al.matchingSequence().mapSegUp(al.seg_from.contig, seg)
             line = al.seg_from.contig  # type:NewLine
             for seg_correct in line.correct_segments.allInter(al.seg_from):
                 for seg_resolved in line.completely_resolved.allInter(
                         seg_correct):
                     if seg_resolved in records:
                         continue
                     if seg_resolved.right == len(line):
                         next_start = len(line)
                     else:
                         next = line.completely_resolved.find(
                             line.asSegment().suffix(
                                 pos=seg_resolved.right), 1)
                         if next is None:
                             next_start = len(line)
                         else:
                             next_start = next.left
                     next_start = min(next_start, len(line) - 200)
                     focus = line.segment(
                         max(seg_resolved.left,
                             min(seg_resolved.right - params.k, seg1.left)),
                         min(seg_correct.right, next_start + params.k))
                     als = list(line.getRelevantAlignmentsFor(focus))
                     reads = ContigStorage()
                     for al in als:
                         reads.add(al.seg_from.contig)
                     als = list(
                         self.aligner.localAlign(reads.unique(),
                                                 ContigStorage([line])))
                     final_als = []
                     sys.stdout.trace("Focus:", focus, seg_resolved)
                     sys.stdout.trace(als)
                     for al in als:
                         if al.seg_to.contig == line.rc:
                             al = al.rc
                         if al.seg_to.interSize(focus) >= params.k - 100:
                             final_als.append(al)
                     sys.stdout.trace(final_als)
                     sys.stdout.trace("Finished realignment of reads")
                     records[seg_resolved] = self.createRecord(
                         seg_resolved, next_start, seg_correct, final_als,
                         good_reads, read_bounds)
     records = list(records.values())  # type: List[LineExtender.Record]
     return records
Example #2
def alsToReads(als):
    # type: (List[AlignmentPiece]) -> ContigStorage
    readIds = set()
    res = ContigStorage()
    for al in als:
        if al.seg_from.contig.id in readIds:
            continue
        readIds.add(al.seg_from.contig.id)
        res.add(al.seg_from.contig)
    return res
Example #3
def ExtendShortContigs(contigs, reads, aligner, polisher, read_dump):
    # type: (ContigStorage, ReadCollection, Aligner, Polisher, str) -> None
    sys.stdout.info("Extending short lines")
    short_contigs = ContigStorage()
    als = dict() # type: Dict[str, List[AlignmentPiece]]
    for contig in contigs.unique():
        if len(contig) < params.k + 500:
            short_contigs.add(contig)
            als[contig.id] = []
            als[contig.rc.id] = []

    if read_dump is not None:
        sys.stdout.trace("Using flye read dump file to extend short contigs")
        relevant_reads = RelevantReadsFromDump(read_dump, short_contigs, reads)
        for contig in short_contigs:
            for al in aligner.overlapAlign(relevant_reads[contig.id], ContigStorage([contig])):
                als[al.seg_to.contig.id].append(al)
                als[al.seg_to.contig.rc.id].append(al.rc)
    else:
        sys.stdout.trace("Realigning all reads to extend short contigs")
        for al in aligner.overlapAlign(reads, short_contigs):
            if al.seg_to.left <= 20 and al.rc.seg_to.left <= 20:
                added = False
                for i, al1 in enumerate(als[al.seg_to.contig.id]):
                    if al1.seg_from.contig.id == al.seg_from.contig.id:
                        added = True
                        if al.percentIdentity() > al1.percentIdentity():
                            als[al.seg_to.contig.id][i] = al
                            als[al.seg_to.contig.rc.id][i] = al.rc
                        break
                if not added:
                    als[al.seg_to.contig.id].append(al)
                    als[al.seg_to.contig.rc.id].append(al.rc)
    for contig in short_contigs.unique():
        if len(als[contig.id]) > 0:
            tmp_contig, new_als = polisher.polishEnd(als[contig.id], params.reliable_coverage, max_extension=params.l - len(contig))
            r = len(tmp_contig) - len(contig)
            tmp_contig, new_als = polisher.polishEnd([al.rc for al in new_als], params.reliable_coverage, max_extension=params.l - len(contig))
            l = len(tmp_contig) - len(contig) - r
        else:
            tmp_contig, new_als = contig, als[contig.id]
            l = 0
            r = 0
#        if l > params.k / 2 and r > params.k / 2:
#            tmp_contig.seq = tmp_contig.seq[l - params.k / 2:-r + params.k / 2]
#        else:
#            tmp_contig.seq = tmp_contig.seq[max(0, l - params.k):-max(1, r - params.k)]
        if len(tmp_contig) > params.k + 500:
            sys.stdout.info("Prolonged contig", contig.id, "for", l, "and", r, "nucleotides from left and right")
            contigs.add(Contig(tmp_contig.rc.seq, contig.id))
        else:
            sys.stdout.warn("Could not prolong contig", contig.id, "enough. Removing it.")
            contigs.remove(contig)
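When all reads are realigned (the else branch above), only the best-identity alignment per read is kept for each short contig. A minimal self-contained sketch of that bookkeeping, with hypothetical (read_id, identity) pairs standing in for AlignmentPiece objects:

def best_alignment_per_read(alignments):
    # Keep one entry per read id, preferring the higher identity, as
    # ExtendShortContigs does when it realigns all reads to a short contig.
    best = dict()
    for read_id, identity in alignments:
        if read_id not in best or identity > best[read_id]:
            best[read_id] = identity
    return best

assert best_alignment_per_read([("r1", 0.91), ("r2", 0.88), ("r1", 0.95)]) == {"r1": 0.95, "r2": 0.88}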
Example #4
 def getRelevantAlignments(self, seg, min_overlap):
     # type: (Segment, int) -> Generator[AlignmentPiece]
     sys.stdout.trace("Requesting read alignments for", seg, " using palignments")
     line = seg.contig #type: NewLine
     reads = ContigStorage()
     relevant_reads = line.read_alignments.allInter(seg, min_overlap)
     sys.stdout.trace("Using reads ", relevant_reads)
     for base_read_al in relevant_reads:
         for read in self.als.getAlignments(base_read_al.seg_from.contig.id, params.k):
             reads.add(read)
     cnt = 0
     for al in self.aligner.localAlign(reads, ContigStorage([seg.contig])):
         if al.seg_to.interSize(seg) > min_overlap and len(al) > params.k:
             yield al
             cnt += 1
     sys.stdout.trace("Request for read alignments for", seg, "yielded", cnt, "alignments")
Example #5
def draw(contigs_file, output_dir, k):
    aligner = Aligner(DirDistributor(os.path.join(output_dir, "alignments")))
    CreateLog(output_dir)
    print "Reading contigs"
    tmp = sorted(SeqIO.parse_fasta(open(contigs_file, "r")),
                 key=lambda contig: len(contig))
    lens = map(len, tmp)[::-1]
    print lens
    contigs = ContigStorage()
    if lens[1::2] == lens[0::2]:
        tmp = tmp[0::2]
        print "Removed extra contigs"
    for i, contig in enumerate(tmp):
        print i, contig
        contigs.add(Contig(contig.seq, str(i)))
    print "Constructing components"
    components = ExtractRepeatComponents(contigs, aligner, k)
    print "Components:"
    for comp in components:
        print comp.segments
        print comp.alignments
    for cnt, comp in enumerate(components):
        print "Processing component", cnt
        print comp.segments
        # print comp.alignments
        print "Forming blocks"
        Block.id_cnt = 0
        blocks = CreateBlocks(comp)
        if len(blocks) == 1:
            print "Skipping trivial repeat"
            continue
        for block in blocks:
            print "Block", block.id, ":", block.segs
        for block in blocks:
            for other in block.out:
                print block.id, "->", other.id
        print "Placing blocks on X axis"
        code = placeX(blocks)
        if code == 1:
            print "WARNING: component", cnt, "contains cycle. Aborting visualization."
            continue
        print "Placing blocks on Y axis"
        placeY(blocks, comp.segments)
        print "Printing figure"
        SimplePrinter().printBlocks(blocks, sys.stdout)
        print "Finished printing figure"
Example #6
def recruit(seqs, reads, k, dir):
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    relevant_reads = ContigStorage()
    disjointigs = seqs
    for i in range(2):
        sys.stdout.info("Recruiting iteration", i)
        als = filter(lambda al: len(al) > k, aligner.localAlign(reads, disjointigs))
        print len(als), "alignments"
        relevant_reads = alsToReads(als)
        l = sum(map(len, seqs.unique()))
        disjointigs = constructDisjointigs(relevant_reads, l, dd.nextDir())
        print len(disjointigs), "disjointigs"
        print disjointigs
    disjointigs.writeToFasta(open(os.path.join(dir, "disjointigs.fasta"), "w"))
    relevant_reads.writeToFasta(open(os.path.join(dir, "reads.fasta"), "w"))
    sys.stdout.info("Aligning repeat sequences to disjointigs")
    als = list(aligner.localAlign(seqs, disjointigs))
    print "\n".join(map(str, als))
    starts = dict()
    for dis in disjointigs:
        starts[dis.id] = len(dis)
    for al in als:
        if len(al) > k:
            starts[al.seg_to.contig.id] = min(starts[al.seg_to.contig.id], al.seg_to.left)
            al = al.rc
            starts[al.seg_to.contig.id] = min(starts[al.seg_to.contig.id], al.seg_to.left)
    print "Starts:"
    for cid, val in starts.items():
        print cid, val
    contigs = ContigStorage()
    cnt = 1
    for dis in disjointigs:
        if starts[dis.id] > k and starts[dis.id] < len(dis):
            print cnt, dis.id, starts[dis.id]
            contigs.add(Contig(dis.prefix(starts[dis.id]).Seq(), str(cnt)))
            cnt += 1
    for dis in disjointigs.unique():
        if len(dis) > k and starts[dis.id] == len(dis):
            print cnt, dis.id
            contigs.add(Contig(dis.seq, str(cnt)))
            cnt += 1
    contigs.writeToFasta(open(os.path.join(dir, "contigs.fasta"), "w"))
    fakeGraph(contigs, open(os.path.join(dir, "graph.gv"), "w"))
Example #7
def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"),
                                               False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type:AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()
Example #8
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))
        tmp_cov = []
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):#type: AlignmentPiece
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)

    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    # contigs.loadFromFasta(open(contigs_file, "r"), num_names=True)
    # contigs = contigs.filter(lambda contig: contig.id not in nonunique and len(contig) > params.k + 20)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs
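The average coverage above is estimated from the longest graph edges that together span half of the total assembled length, as a length-weighted mean of their coverages. A self-contained sketch of that computation (function and variable names are illustrative, not from the repository):

def estimate_avg_coverage(edges):
    # edges: (length, coverage) pairs collected from graph.e.values()
    total = sum(l for l, c in edges) // 2  # half of the total assembled length
    picked = []
    for l, c in sorted(edges, reverse=True):  # longest edges first
        if total < 0:
            break
        picked.append((l, c))
        total -= l
    # length-weighted mean coverage over the picked edges
    return float(sum(l * c for l, c in picked)) / sum(l for l, c in picked)

assert abs(estimate_avg_coverage([(60, 30), (50, 20), (40, 10), (10, 100)]) - 2800.0 / 110) < 1e-9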
Example #9
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
#    contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    cnt = 0
    for read in reads.unique():
        cnt += 1
#        if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for i in range(3):
        diff = 0
        print "Iteration", i
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key = lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key = lambda al: al.seg_from.contig.id)
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        dp = scorer.accurateScore(al.matchingSequence(), 10) #1 - al.percentIdentity()
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10) # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10) # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        diffs = sorted(diffs)
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
#           if pi1 > pi2 + dp / 4:
#               reads1.add(al1.seg_from.contig)
#           elif pi2 > pi1 + dp / 4:
#               reads2.add(al2.seg_from.contig)
#           diff += abs(pi1 - pi2)
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
    al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
    print al
    print "\n".join(al.asMatchingStrings2())
    for read in reads1:
        if read.id in initial_reads1:
            sys.stdout.write(read.id + " ")
    print ""
    for read in reads2:
        if read.id in initial_reads2:
            sys.stdout.write(read.id + " ")
    print ""
    contig1 = prolong(aligner, polisher, contig1, reads1)
    contig2 = prolong(aligner, polisher, contig2, reads2)
    contig1.id = "1"
    contig2.id = "2"
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"
Example #10
class TestDataset:
    def __init__(self,
                 genome="",
                 letter_size=550,
                 error_rate=0.05,
                 mutation_rate=0.005,
                 seed=0):
        random.seed(seed)
        self.reads = []  # type: List[NamedSequence]
        self.disjointigs = []  # type: List[NamedSequence]
        self.contigs = []  # type: List[NamedSequence]
        self.letter_size = letter_size
        self.error_rate = error_rate
        self.mutation_rate = mutation_rate
        self.alphabet = ContigStorage()
        self.matches = dict()
        for c1, c2 in zip(ascii_lowercase, ascii_uppercase):
            seq = self.generate(self.letter_size)
            self.alphabet.add(Contig(seq, c1))
            seq, matches = self.mutate(seq, self.mutation_rate)
            self.alphabet.add(Contig(seq, c2))
            self.matches[c1] = matches
            self.matches[c2] = [(b, a) for a, b in matches]
        self.genome = Contig(self.translate(genome), genome)

    def translate(self, seq):
        return "".join(map(lambda c: self.alphabet[c].seq, seq))

    def addRead(self, read_seq):
        name = "R" + str(len(self.reads)) + "_" + read_seq
        self.reads.append(
            NamedSequence(
                self.mutate(self.translate(read_seq), self.error_rate)[0],
                name))
        return name

    def addDisjointig(self, disjointig_seq):
        # type: (str) -> str
        self.disjointigs.append(
            NamedSequence(
                self.mutate(self.translate(disjointig_seq),
                            self.mutation_rate)[0],
                "D" + str(len(self.disjointigs)) + "_" + disjointig_seq))
        return self.disjointigs[-1].id

    def addContig(self, contig_seq):
        # type: (str) -> str
        name = "C" + str(len(self.contigs)) + "_" + contig_seq
        self.contigs.append(NamedSequence(self.translate(contig_seq), name))
        return name

    def generateReads(self, length=5, cov=15, circular=False):
        genome = self.genome.id
        if circular:
            genome = genome + genome[0:length - 1]
        for i in range(0, len(genome) - length + 1):
            for j in range((cov + length - 1) / length):
                self.addRead(genome[i:i + length])

    def generate(self, letter_size):
        # type: (int) -> str
        return "".join(
            [random.choice(["A", "C", "G", "T"]) for i in range(letter_size)])

    def genAll(self, aligner):
        # type: (Aligner) -> Tuple[NewLineStorage, LineDotPlot, ReadCollection]
        disjointigs = DisjointigCollection()
        for dis in self.disjointigs:
            disjointigs.addNew(dis.seq, dis.id)
        from disjointig_resolve.line_storage import NewLineStorage
        lines = NewLineStorage(disjointigs, aligner)
        lines.name_printer = lambda line: line.id + "_" + self.translateBack(
            line, aligner)
        for line in self.contigs:
            new_line = lines.addNew(line.seq, line.id)
            new_line.initial.add(
                AlignmentPiece.Identical(
                    new_line.asSegment().asContig().asSegment(),
                    new_line.asSegment()))
        dp = LineDotPlot(lines, aligner)
        dp.construct(aligner)
        lines.alignDisjointigs()
        reads = ReadCollection()
        for read in self.reads:
            reads.addNewRead(read)
        disjointigs.addAlignments(aligner.localAlign(reads, disjointigs))
        return lines, dp, reads

    def mutate(self, seq, rate):
        # type: (str, float) -> Tuple[str, List[Tuple[int, int]]]
        res = [seq[0]]
        matches = []
        matches.append((0, 0))
        cur = 1
        for i, c in enumerate(seq):
            if i == 0 or i == len(seq) - 1:
                continue
            if random.random() < rate:
                vars = ["A", "C", "G", "T"]
                vars.remove(c)
                res.append(random.choice([random.choice(vars), "", c + c]))
                cur += len(res[-1])
            else:
                res.append(c)
                matches.append((cur, i))
                cur += 1
        res.append(seq[-1])
        matches.append((cur, len(seq) - 1))
        return "".join(res), matches

    def saveStructure(self, handler):
        # type: (TokenWriter) -> None
        handler.writeToken(self.genome.id)
        handler.writeInt(len(self.reads))
        for read in self.reads:
            handler.writeToken(read.id.split("_")[-1])
        handler.writeInt(len(self.disjointigs))
        for disjointig in self.disjointigs:
            handler.writeToken(disjointig.id.split("_")[-1])
        handler.writeInt(len(self.contigs))
        for contig in self.contigs:
            handler.writeToken(contig.id.split("_")[-1])

    @staticmethod
    def loadStructure(handler):
        # type: (TokenReader) -> TestDataset
        random.seed(0)
        res = TestDataset(handler.readToken())
        for i in range(handler.readInt()):
            res.addRead(handler.readToken())
        for i in range(handler.readInt()):
            res.addDisjointig(handler.readToken())
        for i in range(handler.readInt()):
            res.addContig(handler.readToken())
        return res

    def translateBack(self, contig, aligner):
        # type: (Contig, Aligner) -> str
        res = []
        for al in sorted(aligner.overlapAlign([contig], self.alphabet),
                         key=lambda al: al.seg_from.left):
            if len(res) > 0 and al.seg_from.interSize(
                    res[-1].seg_from) > self.letter_size / 2:
                if al.percentIdentity() > res[-1].percentIdentity():
                    res[-1] = al
            else:
                res.append(al)
        return "".join([al.seg_to.contig.id for al in res])
Example #11
def splitSeg(aligner, seg, mult, all_reads_list):
    all_reads = ContigStorage()
    base = seg.asContig()
    tmp = []
    for al in fixAlDir(
            aligner.overlapAlign(all_reads_list, ContigStorage([base])), base):
        if len(al.seg_to) < len(base) - 100:
            continue
        all_reads.add(al.seg_from.contig)
        tmp.append(al.seg_from.contig)
    all_reads_list = tmp
    split_reads = []
    split_contigs = []
    for i in range(mult):
        split_reads.append([])
        split_contigs.append(base)
    cnt = 0
    for read in all_reads_list:
        split_reads[cnt % mult].append(read)
        cnt += 1
    polisher = Polisher(aligner, aligner.dir_distributor)
    for i in range(10):
        print "Iteration", i
        split_contigs = []
        for reads in split_reads:
            tmp_als = fixAlDir(
                aligner.overlapAlign(reads, ContigStorage([base])), base)
            split_contigs.append(
                Contig(
                    polisher.polishSmallSegment(base.asSegment(),
                                                tmp_als).seg_from.Seq(),
                    str(len(split_contigs))))
        bestals = dict()
        for read in all_reads_list:
            bestals[read.id] = None
        for contig in split_contigs:
            for al in fixAlDir(
                    aligner.overlapAlign(all_reads_list,
                                         ContigStorage([contig])), contig):
                if len(al.seg_to) < len(base) - 100:
                    continue
                if al.seg_from.contig.id not in bestals:
                    print bestals.keys()
                    print al
                best = bestals[al.seg_from.contig.id]
                if best is None or al.percentIdentity() > best.percentIdentity():
                    bestals[al.seg_from.contig.id] = al


#            als.append(fixAlDir(aligner.overlapAlign(all_reads_list, ContigStorage([contig])), contig))
#            als[-1] = sorted(als[-1], key = lambda al: al.seg_from.contig.id)
        for i in range(mult):
            split_reads[i] = []
        for rid in bestals:
            al = bestals[rid]
            if al is None:
                print "Warning: no alignment for read", rid
            else:
                split_reads[int(al.seg_to.contig.id)].append(
                    al.seg_from.contig)
        print " ".join(map(str, map(len, split_reads)))
    maxpi = 0
    print "pi matrix:"
    for i in range(mult):
        for j in range(mult):
            al = aligner.overlapAlign([split_contigs[i]],
                                      ContigStorage([split_contigs[j]])).next()
            sys.stdout.write(str(al.percentIdentity()) + " ")
            maxpi = max(maxpi, al.percentIdentity())
        print ""
    print "Maxpi:", maxpi
    if maxpi < 0.985:
        return zip(split_contigs, split_reads)
    else:
        return None
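splitSeg follows an EM-like scheme: distribute reads round-robin over mult groups, polish one contig per group, reassign every read to the copy it aligns to best, and repeat. A self-contained toy version of that outline, with per-column consensus standing in for polishing and Hamming distance standing in for alignment identity (a sketch of the idea, not of the actual polishing and alignment calls):

def hamming(a, b):
    return sum(1 for x, y in zip(a, b) if x != y)

def consensus(reads):
    # Majority vote per column; reads are assumed equal-length and non-empty.
    return "".join(max("ACGT", key=col.count) for col in zip(*reads))

def split_reads(reads, mult, rounds=5):
    # Round-robin initial split, then alternate consensus building and
    # best-match reassignment, mirroring the loop structure of splitSeg.
    groups = [[] for _ in range(mult)]
    for i, read in enumerate(reads):
        groups[i % mult].append(read)
    for _ in range(rounds):
        refs = [consensus(g) for g in groups]
        groups = [[] for _ in range(mult)]
        for read in reads:
            best = min(range(mult), key=lambda j: hamming(read, refs[j]))
            groups[best].append(read)
    return groups

groups = split_reads(["AAAAAAAT", "AAAAAAAA", "AAATTTTT", "CAAATTTT"], 2)
assert sorted(map(sorted, groups)) == [["AAAAAAAA", "AAAAAAAT"], ["AAATTTTT", "CAAATTTT"]]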
Example #12
def main(flye_dir, rf, dir, edge_id, to_resolve, min_contig_length):
    params.technology = "nano"
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    print " ".join(sys.argv)
    print "Reading graph"
    graph = SimpleGraph().ReadDot(
        os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv"))
    graph.FillSeq(os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta"),
                  True)
    print "Extracting relevant graph component"
    edge_ids = edge_id.split(",")
    to_resolve = to_resolve.split(",")
    to_resolve = [(a, int(b))
                  for a, b in zip(to_resolve[0::2], to_resolve[1::2])]
    unique = uniqueNeighbours(edge_ids, graph, min_contig_length)

    if rf == "none":
        return
    print "Finding reads that align to", edge_ids
    reads_to_resolve = dict()  # type: Dict[str, List[str]]
    for eid, mult in to_resolve:
        reads_to_resolve[eid] = []
    for unique_edge, initial in unique:
        reads_to_resolve[initial] = []
    relevant_read_ids = set()
    for rid, eid in parseReadDump(
            os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if eid in edge_ids:
            relevant_read_ids.add(rid)
            print rid, eid
    for rid, eid in parseReadDump(
            os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if rid in relevant_read_ids and eid in reads_to_resolve:
            reads_to_resolve[eid].append(rid)
    for eid in reads_to_resolve:
        reads_to_resolve[eid] = list(set(reads_to_resolve[eid]))
    print "Reading reads"
    res_reads = ContigStorage()
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_by_name(rf):
        if read.id in relevant_read_ids:
            res_reads.add(Contig(read.seq, read.id))
            SeqIO.write(read, res, "fasta")
    res.close()
    random_down = open(os.path.join(dir, "random_down.fasta"), "w")
    cnt = 0
    for read in res_reads:
        if cnt % 5 == 0:
            SeqIO.write(read, random_down, "fasta")
        cnt += 1
    random_down.close()
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    lcf = open(os.path.join(dir, "contigs.lc"), "w")
    for eid, mult in to_resolve:
        repeat_reads = [res_reads[rid] for rid in reads_to_resolve[eid]]
        print reads_to_resolve[eid]
        print map(str, repeat_reads)
        split_contigs = splitRepeat(aligner, graph.e[eid].seq, mult,
                                    repeat_reads, min_contig_length)
        if split_contigs is None:
            print "Failed to resolve edge", eid, "Aborting"
            continue
        print "Edge", eid, "was split into", mult, "copies"
        for contig, contig_reads in split_contigs:
            print contig.id
            SeqIO.write(contig, res, "fasta")
            lcf.write(contig.id + "\n")
            lcf.write(" ".join([r.id for r in contig_reads]) + "\n")
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    for unique_edge, initial in unique:
        print unique_edge.id
        SeqIO.write(unique_edge, res, "fasta")
        lcf.write(unique_edge.id + "\n")
        lcf.write(" ".join(reads_to_resolve[initial]) + "\n")
    res.close()
    lcf.close()
Example #13
def main(contigs_file, contig_name, reads_file, dir, k):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    contig = contigs[contig_name]
    contigs = ContigStorage()
    contigs.add(contig)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    als = list(aligner.localAlign(reads.unique(), contigs))
    tmp = []
    for al in als:
        if al.seg_to.contig != contig:
            al = al.rc
        tmp.append(al)
    als = tmp
    als = sorted(als,
                 key=lambda al: al.seg_to.left / 50 * 1000000 + al.seg_to.right
                 - al.seg_to.left)
    counts = dict()
    for al in als:
        counts[al.seg_from.contig.id] = 0
    for al in als:
        if len(al) > k:
            counts[al.seg_from.contig.id] += 1
    w = 20
    f = open(os.path.join(dir, "reads.fasta"), "w")
    over = set()
    inter = set()
    for al in als:
        if len(al) < k:
            continue
        inter.add(basic.Normalize(al.seg_from.contig.id))
        if not al.contradictingRTC():
            over.add(basic.Normalize(al.seg_from.contig.id))
        m = al.matchingSequence(True)
        tmp = []
        for i in range(len(contig) / w + 1):
            tmp.append([])
        for a, b in m.matches:
            tmp[b / w].append((a, b))
        for i in range(len(contig) / w):
            if i + 1 < len(tmp) and len(tmp[i + 1]) > 0:
                tmp[i].append(tmp[i + 1][0])
        for i in range(len(contig) / w):
            seg = contig.segment(i * w, i * w + w)
            if al.seg_to.inter(seg):
                if al.seg_to.left >= seg.left and al.seg_from.left > params.bad_end_length:
                    sys.stdout.write("B")
                elif al.seg_to.right <= seg.right and al.rc.seg_from.left > params.bad_end_length:
                    sys.stdout.write("E")
                else:
                    if len(tmp[i]) == 0:
                        sys.stdout.write("*")
                    else:
                        a = tmp[i][-1][0] - tmp[i][0][0]
                        b = tmp[i][-1][1] - tmp[i][0][1]
                        if a - b > 30:
                            sys.stdout.write("I")
                        elif a - b > 15:
                            sys.stdout.write("i")
                        elif a - b < -30:
                            sys.stdout.write("D")
                        elif a - b < -15:
                            sys.stdout.write("d")
                        else:
                            sys.stdout.write(
                                str(min(8,
                                        max(a, b) + 1 - len(tmp[i]))))
            else:
                sys.stdout.write("*")
        print " ", al.seg_from.contig.id, counts[
            al.seg_from.contig.id], al.contradictingRTC()
    print inter
    for rid in inter:
        SeqIO.write(reads[rid], f, "fasta")
        print rid, reads[rid]
    f.close()
    f = open(os.path.join(dir, "reads_over.fasta"), "w")
    for rid in over:
        SeqIO.write(reads[rid], f, "fasta")
    f.close()
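The single-character output in the loop above classifies each w-bp window of the contig by how far the read advances relative to the contig over the matches falling into that window. A self-contained sketch of that per-window classification (reconstructed from the logic above; thresholds copied from the code):

def classify_window(matches):
    # matches: (read_pos, contig_pos) pairs that fall into one contig window,
    # in increasing order; the drift between read and contig advance over the
    # window signals an insertion or a deletion in the read.
    if not matches:
        return "*"
    a = matches[-1][0] - matches[0][0]  # bases consumed on the read
    b = matches[-1][1] - matches[0][1]  # bases consumed on the contig
    if a - b > 30:
        return "I"   # large insertion
    if a - b > 15:
        return "i"   # small insertion
    if a - b < -30:
        return "D"   # large deletion
    if a - b < -15:
        return "d"   # small deletion
    # otherwise report the (capped) number of unmatched positions in the window
    return str(min(8, max(a, b) + 1 - len(matches)))

assert classify_window([(100, 200), (150, 210)]) == "I"
assert classify_window([(100, 200), (110, 245)]) == "D"
assert classify_window([(i, i + 50) for i in range(20)]) == "0"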
Example #14
def main(model_file, k, dir, contigs_file, reads_file):
    # type: (str, int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.scores = ComplexScores()
    params.scores.load(open(model, "r"))
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"),
                                               False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type:AlignedRead
            read.addAlignment(al)
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        cnt = 0
        al0 = None
        others = []
        for al in read.alignments:
            if not al.contradictingRTC():
                cnt += 1
                al0 = al
            else:
                others.append(al)
        if cnt != 1 or len(others) == 0:
            continue
        print al0
        print others
        seg = al0.seg_from
        for al in others:
            if al.seg_from.interSize(seg) < k:
                seg = None
                break
            else:
                seg = al.seg_from.cap(seg)
        print seg
        if seg is None:
            continue
        al0 = al0.reduce(query=seg)
        others = [al.reduce(query=seg) for al in others]
        scorer = Scorer(params.scores)
        for al in others:
            a, b, c = scorer.scoreCommon(al0, al)
            print "win", a, b, c, len(seg)
        if len(seg) > 1000:
            for i in range(len(seg) / 1000):
                seg1 = seg.prefix(length=i * 1000 + 1000).suffix(length=1000)
                for al in others:
                    a, b, c = scorer.scoreCommon(al0.reduce(query=seg1),
                                                 al.reduce(query=seg1))
                    print "win1000", a, b, c, len(seg1)
        for al1 in others:
            for al2 in others:
                if al1 == al2:
                    continue
                a, b, c = scorer.scoreCommon(al1, al2)
                print "draw", a, b, c, len(seg)